diff --git a/.github/workflows/collect-fix-commits.yml b/.github/workflows/collect-fix-commits.yml new file mode 100644 index 0000000..370350b --- /dev/null +++ b/.github/workflows/collect-fix-commits.yml @@ -0,0 +1,37 @@ +name: Hourly sync for collecting fix commits + +on: + workflow_dispatch: + schedule: + - cron: '0 * * * *' + +permissions: + contents: write + +jobs: + scheduled: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install required packages + run: pip install GitPython==3.1.46 packageurl-python==0.17.6 aboutcode.pipeline==0.2.1 + + - name: Run sync + run: python fix_commits_collector.py + + - name: Commit and push if it changed + run: |- + git config user.name "AboutCode Automation" + git config user.email "automation@aboutcode.org" + git add -A + timestamp=$(date -u) + git commit -m "$(echo -e "Sync Collecting Fix Commits: $timestamp\n\nSigned-off-by: AboutCode Automation ")" || exit 0 + git push \ No newline at end of file diff --git a/.github/workflows/collect-issues-prs.yml b/.github/workflows/collect-issues-prs.yml new file mode 100644 index 0000000..2a3df7b --- /dev/null +++ b/.github/workflows/collect-issues-prs.yml @@ -0,0 +1,40 @@ +name: Hourly sync for collecting issues and pull requests + +on: + workflow_dispatch: + schedule: + - cron: '0 * * * *' + +permissions: + contents: write + +jobs: + scheduled: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install required packages + run: pip install PyGithub==2.8.1 python-dotenv==1.2.2 packageurl-python==0.17.6 python-gitlab==8.1.0 aboutcode.pipeline==0.2.1 + + - name: Run sync + env: + GH_API_TOKEN: ${{ secrets.GH_API_TOKEN }} + GLAB_API_TOKEN: ${{ secrets.GLAB_API_TOKEN }} + run: python 
issues_prs_collector.py + + - name: Commit and push if it changed + run: |- + git config user.name "AboutCode Automation" + git config user.email "automation@aboutcode.org" + git add -A + timestamp=$(date -u) + git commit -m "$(echo -e "Sync Collecting Issues and Pull requests related to vulnerabilities: $timestamp\n\nSigned-off-by: AboutCode Automation ")" || exit 0 + git push \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f10862a --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/.env diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..02020d6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,427 @@ +Attribution-ShareAlike 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. 
+ + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. 
More considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-ShareAlike 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-ShareAlike 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. BY-SA Compatible License means a license listed at + creativecommons.org/compatiblelicenses, approved by Creative + Commons as essentially the equivalent of this Public License. + + d. 
Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + e. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + f. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + g. License Elements means the license attributes listed in the name + of a Creative Commons Public License. The License Elements of this + Public License are Attribution and ShareAlike. + + h. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + i. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + j. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + k. 
Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + l. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + m. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. 
The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. Additional offer from the Licensor -- Adapted Material. + Every recipient of Adapted Material from You + automatically receives an offer from the Licensor to + exercise the Licensed Rights in the Adapted Material + under the conditions of the Adapter's License You apply. + + c. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. 
Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. 
You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + b. ShareAlike. + + In addition to the conditions in Section 3(a), if You Share + Adapted Material You produce, the following conditions also apply. + + 1. The Adapter's License You apply must be a Creative Commons + license with the same License Elements, this version or + later, or a BY-SA Compatible License. + + 2. You must include the text of, or the URI or hyperlink to, the + Adapter's License You apply. You may satisfy this condition + in any reasonable manner based on the medium, means, and + context in which You Share Adapted Material. + + 3. You may not offer or impose any additional or different terms + or conditions on, or apply any Effective Technological + Measures to, Adapted Material that restrict exercise of the + rights granted under the Adapter's License You apply. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material, + + including for purposes of Section 3(b); and + c. 
You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. 
However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. 
No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. 
\ No newline at end of file diff --git a/README.md b/README.md index 01aa921..794bd90 100644 --- a/README.md +++ b/README.md @@ -1 +1,92 @@ -# vulnerablecode-vcs-collector \ No newline at end of file +# vulnerablecode-vcs-collector +Collect data (fix commits, issues, and pull requests) related to vulnerabilities. + + +#### Fix commits: +To collect fix commits, we clone the target Git repository and scan every commit message for a CVE ID or GHSA ID. + +File structure: + +```json +{ + "vcs_url": "https://github.com/mirror/busybox", + "vulnerabilities": { + "CVE-2023-42363": { + "fb08d43d44d1fea1f741fafb9aa7e1958a5f69aa": "awk: fix use after free (CVE-2023-42363)\n\nfunction old new delta\nevaluate 3377 3385 +8\n\nFixes https://bugs.busybox.net/show_bug.cgi?id=15865\n\nSigned-off-by: Natanael Copa \nSigned-off-by: Denys Vlasenko " + } + } +} +``` + +#### Issues and PRs: +To collect issues and pull requests, we use the GitHub/GitLab API to run a quick search for `CVE-`. + +File structure: + +```json +{ + "vcs_url": "https://github.com/python/cpython", + "vulnerabilities": { + "CVE-2026-2297": { + "Issues": [ + "https://github.com/python/cpython/issues/145506" + ], + "PRs": [ + "https://github.com/python/cpython/pull/145514", + "https://github.com/python/cpython/pull/145516", + "https://github.com/python/cpython/pull/145515", + "https://github.com/python/cpython/pull/145507", + "https://github.com/python/cpython/pull/145512", + "https://github.com/python/cpython/pull/145513" + ] + } + } +} +``` + +### File Naming +The results are stored in a JSON file named `{repo_name}-{repo_url_hash}.json`, e.g. `nginx-9251c307.json`. + +**Note:** `repo_url_hash` is the first 8 characters of the SHA-256 hash of the repository URL. +## Usage + +To get started, clone the repository: + +```bash +git clone https://github.com/aboutcode-data/vulnerablecode-vcs-collector.git +``` + + +Once cloned, you can find the existing data in the `data/fix-commits` and `data/issues-prs` directories. + +To run the 
pipeline and generate new files, create a `.env` file and add your API tokens: + +```bash +GH_API_TOKEN="ghp_xxx" +GLAB_API_TOKEN="glpat-xxx" +``` + +Then, you can run the collectors using Python: + +To collect fix commits: +```bash +python fix_commits_collector.py +``` + +To collect issues and pull requests: +```bash +python issues_prs_collector.py +``` + +## Testing + +Ensure you have `pytest` installed by running this command: +```bash +pip install pytest +``` + +Then, you can run the tests using this command: +```bash +python -m pytest test/ -v +``` + diff --git a/config/fix_commits_targets.json b/config/fix_commits_targets.json new file mode 100644 index 0000000..668f6b0 --- /dev/null +++ b/config/fix_commits_targets.json @@ -0,0 +1,365 @@ +[ + "https://github.com/apache/poi", + "https://github.com/wagtail/wagtail", + "https://github.com/github/docs", + "https://github.com/kubernetes-sigs/secrets-store-csi-driver", + "https://github.com/rancher/rancher", + "https://github.com/apache/ranger", + "https://github.com/opentofu/opentofu", + "https://github.com/apache/santuario-xml-security-java", + "https://github.com/nationalsecurityagency/ghidra", + "https://github.com/squid-cache/squid", + "https://gitlab.com/gitlab-org/gitlab", + "https://github.com/jackc/pgx", + "https://github.com/apache/ignite", + "https://github.com/webpack/webpack", + "https://github.com/python/cpython", + "https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git", + "https://github.com/apache/hive", + "https://github.com/onnx/onnx", + "https://github.com/apache/jackrabbit", + "https://github.com/postgres/postgres", + "https://github.com/rust-lang/rust", + "https://gitlab.com/samba-team/samba", + "https://github.com/kubernetes-sigs/aws-iam-authenticator", + "https://github.com/apache/logging-log4net", + "https://github.com/psf/requests", + "https://github.com/swiftlang/swift", + "https://github.com/openstack/keystone", + "https://github.com/apache/hadoop", + 
"https://github.com/apache/mina-sshd", + "https://github.com/nixos/nixpkgs", + "https://github.com/imagemagick/imagemagick6", + "https://android.googlesource.com/platform/external/expat", + "https://bitbucket.org/b_c/jose4j", + "https://bitbucket.org/tildeslash/monit", + "https://git.kernel.org/pub/scm/utils/util-linux/util-linux.git", + "https://github.com/ansible-collections/community.general", + "https://github.com/beego/beego", + "https://github.com/apache/cordova-android", + "https://github.com/xen-project/xen", + "https://github.com/openssh/openssh-portable", + "https://github.com/denoland/std", + "https://github.com/angular/angular-cli", + "https://github.com/sveltejs/kit", + "https://github.com/bytecodealliance/wasmtime", + "https://github.com/dart-lang/http", + "https://gitlab.com/libtiff/libtiff", + "https://github.com/apache/cxf", + "https://github.com/swagger-api/swagger-ui", + "https://github.com/golang/go", + "https://gitlab.com/cryptsetup/cryptsetup", + "https://github.com/matrix-org/synapse", + "https://github.com/apache/pulsar", + "https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git", + "https://github.com/kubernetes/kubernetes", + "https://github.com/videolan/vlc", + "https://github.com/apache/struts", + "https://github.com/netdata/netdata", + "https://github.com/apache/maven", + "https://android.googlesource.com/platform/external/conscrypt", + "https://github.com/labstack/echo", + "https://github.com/openstack/nova", + "https://github.com/qemu/qemu", + "https://github.com/django/django", + "https://android.googlesource.com/platform/external/libjpeg-turbo", + "https://github.com/nltk/nltk", + "https://github.com/pytorch/pytorch", + "https://github.com/mirror/busybox", + "https://github.com/weechat/weechat", + "https://github.com/moby/moby", + "https://android.googlesource.com/platform/external/skia", + "https://github.com/apache/avro", + "https://github.com/twisted/twisted", + "https://github.com/pyca/pyopenssl", + 
"https://github.com/oven-sh/bun", + "https://github.com/capstone-engine/capstone", + "https://github.com/apache/atlas", + "https://github.com/psf/black", + "https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git", + "https://github.com/apache/rocketmq", + "https://github.com/lxc/lxc", + "https://github.com/matplotlib/matplotlib", + "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git", + "https://android.googlesource.com/platform/art", + "https://github.com/dart-lang/sdk", + "https://chromium.googlesource.com/webm/libwebp", + "https://github.com/sequelize/sequelize", + "https://github.com/opencv/opencv", + "https://github.com/apache/activemq-artemis", + "https://android.googlesource.com/platform/external/okhttp", + "https://github.com/openssl/openssl", + "https://github.com/pypa/virtualenv", + "https://github.com/apache/commons-beanutils", + "https://github.com/redis/redis-py", + "https://android.googlesource.com/platform/external/webkit", + "https://github.com/backstage/backstage", + "https://github.com/apache/beam", + "https://github.com/mongodb/mongo-go-driver", + "https://github.com/apache/kafka", + "https://github.com/apache/skywalking", + "https://github.com/apache/tomcat", + "https://github.com/vercel/next.js", + "https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git", + "https://github.com/netty/netty", + "https://gerrit.googlesource.com/gerrit", + "https://github.com/typeorm/typeorm", + "https://github.com/bcgit/bc-java", + "https://github.com/open-policy-agent/opa", + "https://github.com/nextcloud/android", + "https://github.com/nim-lang/nim", + "https://github.com/microsoft/playwright", + "https://github.com/ollama/ollama", + "https://gitlab.com/freetype/freetype", + "https://github.com/getsentry/sentry-java", + "https://gitlab.gnome.org/gnome/gegl", + "https://chromium.googlesource.com/chromium/src", + "https://github.com/apache/activemq", + "https://github.com/apache/logging-log4j2", + 
"https://android.googlesource.com/platform/build", + "https://aomedia.googlesource.com/aom", + "https://github.com/jedisct1/pure-ftpd", + "https://github.com/langchain-ai/langchain", + "https://github.com/php/php-src", + "https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git", + "https://android.googlesource.com/platform/external/libpng", + "https://github.com/canonical/lxd", + "https://github.com/vim/vim", + "https://github.com/canonical/snapd", + "https://github.com/gnome/gnome-shell", + "https://github.com/pydantic/pydantic", + "https://github.com/matrix-org/dendrite", + "https://github.com/kubernetes/kube-state-metrics", + "https://gitlab.com/gitlab-org/gitlab-foss", + "https://github.com/openbsd/src", + "https://android.googlesource.com/platform/bionic", + "https://github.com/woocommerce/woocommerce", + "https://github.com/u-boot/u-boot", + "https://github.com/varnishcache/varnish-cache", + "https://github.com/apache/shiro", + "https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git", + "https://github.com/sigstore/cosign", + "https://github.com/apache/airflow", + "https://gitlab.com/mailman/mailman", + "https://github.com/apache/ofbiz-framework", + "https://github.com/mermaid-js/mermaid", + "https://github.com/matrix-org/matrix-js-sdk", + "https://github.com/apache/orc", + "https://github.com/twbs/bootstrap", + "https://chromium.googlesource.com/chromium/blink", + "https://github.com/apache/accumulo", + "https://github.com/neovim/neovim", + "https://github.com/cakephp/cakephp", + "https://android.googlesource.com/platform/system/core", + "https://github.com/apache/commons-io", + "https://github.com/bitcoin/bitcoin", + "https://github.com/bytecodealliance/wasm-micro-runtime", + "https://github.com/apache/ant-ivy", + "https://github.com/hashicorp/terraform", + "https://github.com/apache/commons-collections", + "https://github.com/apache/pinot", + "https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git", + 
"https://github.com/puppetlabs/puppet", + "https://github.com/bagder/curl", + "https://github.com/getsentry/sentry-python", + "https://github.com/openstack/neutron", + "https://github.com/apache/commons-compress", + "https://gitlab.com/gstreamer/gstreamer", + "https://github.com/apache/storm", + "https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git", + "https://github.com/langchain-ai/langchainjs", + "https://github.com/webkit/webkit", + "https://github.com/denoland/deno", + "https://github.com/gnome/glib", + "https://github.com/apache/zookeeper", + "https://android.googlesource.com/platform/external/libvpx", + "https://gitlab.com/wireshark/wireshark", + "https://github.com/langchain-ai/langgraph", + "https://github.com/strongswan/strongswan", + "https://github.com/redis/redis", + "https://github.com/c-ares/c-ares", + "https://android.googlesource.com/platform/frameworks/base", + "https://github.com/numpy/numpy", + "https://github.com/lxc/lxd", + "https://android.googlesource.com/platform/external/boringssl", + "https://gitlab.com/libssh/libssh-mirror", + "https://github.com/apache/hbase", + "https://gitweb.gentoo.org/repo/gentoo.git", + "https://github.com/mono/mono", + "https://git.kernel.org/pub/scm/bluetooth/bluez.git", + "https://github.com/opencontainers/runc", + "https://github.com/jenkinsci/jenkins", + "https://android.googlesource.com/platform/external/freetype", + "https://github.com/apache/nifi", + "https://github.com/python-pillow/pillow", + "https://github.com/apache/arrow", + "https://github.com/apache/lucene-solr", + "https://github.com/ansible-collections/community.crypto", + "https://github.com/open-telemetry/opentelemetry-go", + "https://github.com/bitnami-labs/sealed-secrets", + "https://github.com/krb5/krb5", + "https://github.com/apache/pdfbox", + "https://github.com/quarkusio/quarkus", + "https://github.com/openstack/swift", + "https://github.com/git/git", + "https://github.com/sqlite/sqlite", + 
"https://github.com/imagemagick/imagemagick", + "https://github.com/mongodb/mongo", + "https://github.com/apache/subversion", + "https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git", + "https://github.com/apache/superset", + "https://github.com/openzfs/zfs", + "https://github.com/nixos/nix", + "https://android.googlesource.com/platform/frameworks/av", + "https://github.com/apache/flink", + "https://github.com/ray-project/ray", + "https://github.com/zulip/zulip", + "https://github.com/glennrp/libpng", + "https://github.com/moby/buildkit", + "https://android.googlesource.com/platform/external/libxml2", + "https://github.com/apache/tika", + "https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git", + "https://github.com/gnome/libxml2", + "https://git.kernel.org/pub/scm/git/git.git", + "https://chromium.googlesource.com/webm/libwebm", + "https://github.com/madler/zlib", + "https://github.com/minio/minio", + "https://github.com/istio/istio", + "https://github.com/brave/brave-core", + "https://gitlab.gnome.org/gnome/libxml2", + "https://github.com/bcgit/bc-csharp", + "https://github.com/github/cmark-gfm", + "https://github.com/ruby/ruby", + "https://github.com/nmap/nmap", + "https://github.com/apache/lucene", + "https://github.com/nestjs/nest", + "https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git", + "https://github.com/mariadb/server", + "https://github.com/apache/cassandra", + "https://github.com/apache/drill", + "https://github.com/apache/druid", + "https://github.com/openvpn/openvpn", + "https://github.com/apache/archiva", + "https://github.com/ansible/ansible", + "https://github.com/apache/dolphinscheduler", + "https://github.com/apache/karaf", + "https://github.com/gnome/nautilus", + "https://github.com/apache/calcite", + "https://github.com/osquery/osquery", + "https://github.com/dbt-labs/dbt-core", + "https://git.kernel.org/pub/scm/virt/kvm/kvm.git", + "https://github.com/facebook/react", + "https://github.com/radareorg/radare2", + 
"https://github.com/gin-gonic/gin", + "https://gitlab.com/qemu-project/qemu", + "https://github.com/dart-lang/pub", + "https://github.com/sinatra/sinatra", + "https://github.com/apache/commons-text", + "https://gitlab.gnome.org/gnome/gdk-pixbuf", + "https://github.com/nats-io/nats-server", + "https://github.com/mastodon/mastodon", + "https://github.com/urllib3/urllib3", + "https://github.com/babel/babel", + "https://github.com/open-telemetry/opentelemetry-python-contrib", + "https://github.com/spring-projects/spring-framework", + "https://github.com/caddyserver/caddy", + "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git", + "https://github.com/mattermost/mattermost", + "https://github.com/apache/james-project", + "https://github.com/nuxt/nuxt", + "https://github.com/kubernetes/ingress-nginx", + "https://bitbucket.org/libgd/gd-libgd", + "https://github.com/apache/httpcomponents-client", + "https://github.com/apache/flume", + "https://github.com/git-for-windows/git", + "https://github.com/apache/kylin", + "https://github.com/openjdk/jdk", + "https://github.com/bpftrace/bpftrace", + "https://gitlab.gnome.org/gnome/glib", + "https://github.com/prometheus/prometheus", + "https://bitbucket.org/snakeyaml/snakeyaml", + "https://github.com/nginx/nginx", + "https://chromium.googlesource.com/v8/v8", + "https://github.com/apache/spark", + "https://github.com/influxdata/influxdb", + "https://github.com/pypa/pip", + "https://chromium.googlesource.com/chromium/third_party/ffmpeg", + "https://android.googlesource.com/platform/libcore", + "https://github.com/apache/libcloud", + "https://github.com/pyca/cryptography", + "https://github.com/stedolan/jq", + "https://github.com/sveltejs/svelte", + "https://github.com/rabbitmq/rabbitmq-java-client", + "https://android.googlesource.com/platform/cts", + "https://github.com/apache/commons-lang", + "https://github.com/laravel/framework", + "https://github.com/apache/thrift", + 
"https://github.com/the-tcpdump-group/tcpdump", + "https://github.com/remix-run/react-router", + "https://gitlab.gnome.org/gnome/gimp", + "https://github.com/koajs/koa", + "https://github.com/jellyfin/jellyfin-web", + "https://github.com/borgbackup/borg", + "https://bitbucket.org/connect2id/nimbus-jose-jwt", + "https://github.com/modelcontextprotocol/python-sdk", + "https://github.com/matrix-org/matrix-react-sdk", + "https://github.com/burntsushi/ripgrep", + "https://github.com/open-telemetry/opentelemetry-collector-contrib", + "https://github.com/torvalds/linux", + "https://github.com/apache/solr", + "https://github.com/github/codeql-action", + "https://github.com/getsentry/sentry-javascript", + "https://github.com/sigstore/fulcio", + "https://github.com/apache/zeppelin", + "https://github.com/ory/hydra", + "https://github.com/buildroot/buildroot", + "https://github.com/protocolbuffers/protobuf", + "https://github.com/wireshark/wireshark", + "https://github.com/github/advisory-database", + "https://github.com/github/gh-ost", + "https://github.com/markedjs/marked", + "https://github.com/opencontainers/image-spec", + "https://github.com/nlnetlabs/unbound", + "https://github.com/mitmproxy/mitmproxy", + "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git", + "https://github.com/micropython/micropython", + "https://github.com/strapi/strapi", + "https://github.com/rails/rails", + "https://github.com/ipython/ipython", + "https://github.com/rails/rails-html-sanitizer", + "https://github.com/apache/ant", + "https://android.googlesource.com/platform/external/pdfium", + "https://android.googlesource.com/kernel/msm", + "https://github.com/go-gitea/gitea", + "https://github.com/sigstore/sigstore", + "https://github.com/sigstore/rekor", + "https://android.googlesource.com/platform/external/sqlite", + "https://github.com/mysql/mysql-server", + "https://github.com/lxml/lxml", + "https://gitlab.com/libvirt/libvirt", + "https://github.com/getsentry/sentry", + 
"https://github.com/jellyfin/jellyfin", + "https://github.com/kubernetes/client-go", + "https://github.com/gnome/gimp", + "https://github.com/apache/jena", + "https://github.com/apache/groovy", + "https://github.com/streamlit/streamlit", + "https://github.com/git-lfs/git-lfs", + "https://github.com/apache/commons-fileupload", + "https://github.com/torproject/tor", + "https://github.com/canonical/cloud-init", + "https://github.com/nodejs/node", + "https://github.com/apache/httpd", + "https://github.com/containerd/containerd", + "https://github.com/apache/camel", + "https://android.googlesource.com/platform/dalvik", + "https://github.com/caolan/async", + "https://android.googlesource.com/kernel/common", + "https://github.com/bootstrap-vue/bootstrap-vue", + "https://github.com/webpack/webpack-dev-server", + "https://github.com/swagger-api/swagger-codegen", + "https://github.com/open-telemetry/opentelemetry-dotnet", + "https://github.com/tokio-rs/tracing", + "https://github.com/laravel/laravel", + "https://github.com/FFmpeg/FFmpeg" +] \ No newline at end of file diff --git a/config/issues_prs_targets.json b/config/issues_prs_targets.json new file mode 100644 index 0000000..2bf491d --- /dev/null +++ b/config/issues_prs_targets.json @@ -0,0 +1,31 @@ +[ + "https://github.com/mirror/busybox", + "https://github.com/nginx/nginx", + "https://github.com/apache/tomcat", + "https://github.com/mongodb/mongo", + "https://github.com/redis/redis", + "https://github.com/php/php-src", + "https://github.com/python/cpython", + "https://github.com/ruby/ruby", + "https://github.com/golang/go", + "https://github.com/nodejs/node", + "https://github.com/rust-lang/rust", + "https://github.com/openjdk/jdk", + "https://github.com/swiftlang/swift", + "https://github.com/django/django", + "https://github.com/rails/rails", + "https://github.com/laravel/framework", + "https://github.com/spring-projects/spring-framework", + "https://github.com/facebook/react", + "https://github.com/angular/angular", 
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import hashlib
import json
import re
import shutil
import sys
import tempfile
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

from aboutcode.pipeline import BasePipeline, LoopProgress
from git import Repo
from packageurl.contrib.url2purl import url2purl


class CollectVCSFixCommitPipeline(BasePipeline):
    """
    Pipeline to collect fix commits from any git repository.

    The steps clone ``vcs_url`` as a bare, blob-less repository, scan every
    commit message for vulnerability identifiers, write the commits grouped
    by identifier to ``data/fix-commits/<name>-<hash>.json`` and finally
    remove the temporary clone.
    """

    vcs_url: str
    # Regular expressions (applied case-insensitively) for the supported
    # identifier families: CVE ids and GitHub Security Advisory (GHSA) ids.
    patterns: list[str] = [
        r"\bCVE-\d{4}-\d{4,19}\b",
        r"GHSA-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}",
    ]

    def __init__(self, vcs_url: str, *args, **kwargs):
        """Remember the repository URL, then initialize the base pipeline."""
        self.vcs_url = vcs_url
        super().__init__(*args, **kwargs)

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return (
            cls.clone,
            cls.collect_fix_commits,
            cls.store_items,
            cls.clean_downloads,
        )

    def log(self, message):
        """Print ``message`` prefixed with a local, millisecond-precision timestamp."""
        now_local = datetime.now(timezone.utc).astimezone()
        # ``%f`` yields microseconds; drop the last three digits for milliseconds.
        timestamp = now_local.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        print(f"{timestamp} {message}")

    def clone(self):
        """Clone the repository into a fresh temporary directory as ``self.repo``."""
        # Only commit metadata is needed, so skip the checkout and file blobs.
        self.repo = Repo.clone_from(
            url=self.vcs_url,
            to_path=tempfile.mkdtemp(),
            bare=True,
            no_checkout=True,
            multi_options=["--filter=blob:none"],
        )

    @staticmethod
    def _message_text(commit) -> str:
        """Return the commit message as text, decoding it when it is raw bytes."""
        message = commit.message
        # GitPython types ``Commit.message`` as ``str | bytes``; a bytes value
        # would make ``re.findall`` with a text pattern raise ``TypeError``.
        if isinstance(message, bytes):
            return message.decode("utf-8", errors="replace")
        return message

    def extract_vulnerability_id(self, commit) -> list[str]:
        """
        Return every vulnerability id found in the message of ``commit``.

        Ids are returned exactly as written in the message (case is not
        normalized) and may contain duplicates. The list is empty when no
        pattern matches.
        """
        message = self._message_text(commit)
        matches = []
        for pattern in self.patterns:
            matches.extend(re.findall(pattern, message, flags=re.IGNORECASE))
        return matches

    def collect_fix_commits(self):
        """
        Iterate through repository commits and group them by vulnerability identifiers.

        Populate and return ``self.collected_items``, a mapping with the keys
        ``vcs_url`` and ``vulnerabilities``; the latter maps each upper-cased
        vulnerability id to ``{commit sha: stripped commit message}``.
        """
        self.log(
            "Processing git repository fix commits (grouped by vulnerability IDs)."
        )

        vulnerabilities = defaultdict(dict)
        self.collected_items = {
            "vcs_url": self.vcs_url,
            "vulnerabilities": vulnerabilities,
        }

        for commit in self.repo.iter_commits("--all"):
            matched_ids = self.extract_vulnerability_id(commit)
            if not matched_ids:
                continue

            commit_message = self._message_text(commit).strip()
            for vuln_id in matched_ids:
                # Normalize the id case so "cve-…" and "CVE-…" share one entry.
                vulnerabilities[vuln_id.upper()][commit.hexsha] = commit_message

        # Count the vulnerabilities, not the two top-level keys of the result.
        self.log(
            f"Found {len(vulnerabilities)} vulnerabilities with related commits."
        )
        self.log("Finished processing all commits.")
        return self.collected_items

    def store_items(self):
        """
        Write the collected fix commits of this repository to a JSON file.

        Nothing is written when the URL cannot be converted to a named purl
        or when no vulnerability was collected.
        """
        self.log("Storing collected fix commits")
        purl = url2purl(self.vcs_url)

        if not (purl and purl.name) or not self.collected_items.get("vulnerabilities"):
            self.log("Nothing to store for collected fix commits")
            return

        # A short URL hash keeps same-named repositories from different
        # hosts or namespaces in separate files.
        vcs_url_hash = hashlib.sha256(self.vcs_url.encode("utf-8")).hexdigest()[:8]
        path = Path(f"data/fix-commits/{purl.name}-{vcs_url_hash}.json")
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.collected_items, f, indent=2)

    def clean_downloads(self):
        """Delete the temporary clone created by ``clone``, if there is one."""
        self.log("Cleaning up local repository resources")
        if hasattr(self, "repo") and self.repo.working_dir:
            shutil.rmtree(path=self.repo.working_dir)


if __name__ == "__main__":
    with open("config/fix_commits_targets.json", encoding="utf-8") as f:
        vcs_urls = json.load(f)

    progress = LoopProgress(
        total_iterations=len(vcs_urls),
        logger=print,
    )

    for vcs_url in progress.iter(vcs_urls):
        status_code, error_msg = CollectVCSFixCommitPipeline(vcs_url=vcs_url).execute()
        # Report failures only; presumably the message is empty on success,
        # which used to print a blank line per repository.
        if error_msg:
            print(error_msg)

    # A failing repository must not fail the scheduled job as a whole.
    sys.exit(0)
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import hashlib
import json
import os
import re
import sys
from abc import abstractmethod
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

import gitlab
from aboutcode.pipeline import BasePipeline, LoopProgress
from dotenv import load_dotenv
from github import Github
from packageurl.contrib.url2purl import url2purl

load_dotenv()


class VCSCollector(BasePipeline):
    """
    Pipeline to collect GitHub/GitLab issues and PRs related to vulnerabilities.

    Subclasses implement ``fetch_entries`` (set ``self.issues`` and
    ``self.prs``) and ``collect_items`` (fill ``self.collected_items``).
    """

    vcs_url: str
    CVE_PATTERN = re.compile(r"(CVE-\d{4}-\d+)", re.IGNORECASE)
    # Search terms sent to the hosting service to pre-filter issues and PRs.
    SUPPORTED_IDENTIFIERS = ["CVE-"]

    def __init__(self, vcs_url: str, purl, *args, **kwargs):
        """Store the repository URL and purl, then prepare an empty result."""
        self.vcs_url = vcs_url
        self.purl = purl
        self.repo_name = f"{self.purl.namespace}/{self.purl.name}"
        self.collected_items = {
            "vcs_url": self.vcs_url,
            "vulnerabilities": defaultdict(lambda: {"Issues": [], "PRs": []}),
        }
        super().__init__(*args, **kwargs)

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return (
            cls.fetch_entries,
            cls.collect_items,
            cls.store_items,
        )

    def log(self, message):
        """Print ``message`` prefixed with a local, millisecond-precision timestamp."""
        now_local = datetime.now(timezone.utc).astimezone()
        # ``%f`` yields microseconds; drop the last three digits for milliseconds.
        timestamp = now_local.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        print(f"{timestamp} {message}")

    @abstractmethod
    def fetch_entries(self):
        """Query the hosting service and set ``self.issues`` and ``self.prs``."""
        raise NotImplementedError

    @abstractmethod
    def collect_items(self):
        """Fill ``self.collected_items`` from ``self.issues`` and ``self.prs``."""
        raise NotImplementedError

    def _record_item(self, item_type, text, url):
        """
        Record ``url`` under every CVE id mentioned in ``text``.

        ``item_type`` is ``"Issues"`` or ``"PRs"``. Nothing is recorded when
        ``url`` is empty. The URL is filed under each distinct CVE id, so an
        entry mentioning several CVEs is attached to all of them, and a URL
        is never listed twice for the same id.
        """
        if not url:
            return

        # dict.fromkeys() drops repeated ids while keeping first-seen order,
        # which keeps the stored JSON stable between runs.
        cve_ids = dict.fromkeys(
            match.upper() for match in self.CVE_PATTERN.findall(text)
        )
        for cve_id in cve_ids:
            urls = self.collected_items["vulnerabilities"][cve_id][item_type]
            if url not in urls:
                urls.append(url)

    def store_items(self):
        """Write the collected issues and PRs to a JSON file, if there are any."""
        self.log("Storing collected Issues and PRs commit results")
        if not self.collected_items.get("vulnerabilities"):
            self.log("No collected Issues and PRs results")
            return

        # A short URL hash keeps same-named repositories from different
        # hosts or namespaces in separate files.
        vcs_url_hash = hashlib.sha256(self.vcs_url.encode("utf-8")).hexdigest()[:8]
        path = Path(f"data/issues-prs/{self.purl.name}-{vcs_url_hash}.json")

        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.collected_items, f, indent=2)


class GitLabCollector(VCSCollector):
    """Collect CVE-related issues and merge requests from a gitlab.com project."""

    def fetch_entries(self):
        """
        Fetch Gitlab Data Entries

        Raise ``ValueError`` when the ``GLAB_API_TOKEN`` environment variable
        is missing or empty.
        """
        gitlab_token = os.getenv("GLAB_API_TOKEN")

        if not gitlab_token:
            raise ValueError("GLAB_API_TOKEN environment variable not set properly")

        gl = gitlab.Gitlab("https://gitlab.com/", private_token=gitlab_token)
        project = gl.projects.get(self.repo_name)
        base_query = " ".join(self.SUPPORTED_IDENTIFIERS)
        self.issues = project.search(scope="issues", search=base_query, iterator=True)
        self.prs = project.search(
            scope="merge_requests", search=base_query, iterator=True
        )

    def collect_items(self):
        """File each fetched issue and merge request URL under the CVEs it mentions."""
        for i_type, items in [("Issues", self.issues), ("PRs", self.prs)]:
            for item in items:
                # Entries are read as mappings; title and description may be missing.
                title = item.get("title") or ""
                description = item.get("description") or ""
                self._record_item(
                    i_type, title + " " + description, item.get("web_url")
                )


class GitHubCollector(VCSCollector):
    """Collect CVE-related issues and pull requests from a GitHub repository."""

    def fetch_entries(self):
        """
        Fetch GitHub Data Entries

        Raise ``ValueError`` when the ``GH_API_TOKEN`` environment variable is
        missing or empty.
        """
        github_token = os.getenv("GH_API_TOKEN")
        if not github_token:
            raise ValueError("GH_API_TOKEN environment variable not set properly")

        g = Github(login_or_token=github_token)
        base_query = (
            f"repo:{self.repo_name} ({' OR '.join(self.SUPPORTED_IDENTIFIERS)})"
        )
        self.issues = g.search_issues(f"{base_query} is:issue")
        self.prs = g.search_issues(f"{base_query} is:pr")

    def collect_items(self):
        """File each fetched issue and pull request URL under the CVEs it mentions."""
        for i_type, items in [("Issues", self.issues), ("PRs", self.prs)]:
            for item in items:
                text = (item.title or "") + " " + (item.body or "")
                self._record_item(i_type, text, item.html_url)


if __name__ == "__main__":
    with open("config/issues_prs_targets.json", encoding="utf-8") as f:
        vcs_urls = json.load(f)

    progress = LoopProgress(
        total_iterations=len(vcs_urls),
        logger=print,
    )
    # Map the purl type of a repository URL to the collector that handles it.
    collector_classes = {
        "gitlab": GitLabCollector,
        "github": GitHubCollector,
    }
    for vcs_url in progress.iter(vcs_urls):
        purl = url2purl(vcs_url)
        # url2purl may not yield a purl for an unrecognized URL, so guard
        # before reading its type.
        collector_class = collector_classes.get(purl.type) if purl else None
        if collector_class is None:
            print(f"Unsupported VCS URL: {vcs_url}")
            continue

        status_code, error_msg = collector_class(vcs_url=vcs_url, purl=purl).execute()
        # Report failures only; presumably the message is empty on success.
        if error_msg:
            print(error_msg)

    # A failing repository must not fail the scheduled job as a whole.
    sys.exit(0)
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://aboutcode.org for more information about nexB OSS projects.
#

from unittest.mock import MagicMock

import pytest

from fix_commits_collector import CollectVCSFixCommitPipeline


class TestCollectVCSFixCommitPipeline:
    """Tests for grouping commits by the vulnerability ids in their messages."""

    def test_collect_fix_commits(self):
        """Commits are grouped under upper-cased ids with stripped messages."""
        vcs_url = "https://github.com/aboutcode-org/test"
        pipeline = CollectVCSFixCommitPipeline(vcs_url=vcs_url)

        # (sha, message) pairs standing in for the commits of the repository.
        fake_history = [
            ("dd7769fbc97c84545579cebf1dc4838214098a11", " fixes cve-2023-40024 \n"),
            ("ab801c46c0b0e8b921f690ea47c927379e8862a3", "Update README file"),
            (
                "ab801c46c0b0e8b921f690ea47c927379e8862a3",
                "Patch CVE-2026-21711 and GHSA-vcqx-cqfc-xc2r",
            ),
        ]
        pipeline.repo = MagicMock()
        pipeline.repo.iter_commits.return_value = [
            MagicMock(hexsha=sha, message=text) for sha, text in fake_history
        ]

        result = pipeline.collect_fix_commits()

        expected_vulnerabilities = {
            "CVE-2023-40024": {
                "dd7769fbc97c84545579cebf1dc4838214098a11": "fixes cve-2023-40024"
            },
            "CVE-2026-21711": {
                "ab801c46c0b0e8b921f690ea47c927379e8862a3": "Patch CVE-2026-21711 and GHSA-vcqx-cqfc-xc2r"
            },
            "GHSA-VCQX-CQFC-XC2R": {
                "ab801c46c0b0e8b921f690ea47c927379e8862a3": "Patch CVE-2026-21711 and GHSA-vcqx-cqfc-xc2r"
            },
        }
        assert result["vcs_url"] == vcs_url
        assert result["vulnerabilities"] == expected_vulnerabilities


@pytest.mark.parametrize(
    "commit_message, expected_matches",
    [
        ("Update README.md with instructions", []),
        ("Fixes CVE-2023-12345 in the backend", ["CVE-2023-12345"]),
        ("Fix GHSA-2ggp-cmvm-f62f here", ["GHSA-2ggp-cmvm-f62f"]),
        (
            "fixes cve-2026-21711 and ghsa-vcqx-cqfc-xc2r",
            ["cve-2026-21711", "ghsa-vcqx-cqfc-xc2r"],
        ),
        ("Fix CVE-2020-123456789gff0", []),
    ],
)
def test_extract_vulnerability_id(commit_message, expected_matches):
    """Ids are extracted as written; the order of the matches is not checked."""
    fake_commit = MagicMock()
    fake_commit.message = commit_message

    found = CollectVCSFixCommitPipeline(
        vcs_url="https://github.com/aboutcode-org/test"
    ).extract_vulnerability_id(fake_commit)

    assert set(found) == set(expected_matches)
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://aboutcode.org for more information about nexB OSS projects.
#

from unittest.mock import MagicMock, patch

import pytest
from packageurl import PackageURL

from issues_prs_collector import GitHubCollector, GitLabCollector


class TestGitHubCollector:
    """Tests for the GitHub issues and pull requests collector."""

    def setup_method(self):
        """Build a collector for a placeholder GitHub repository."""
        self.github_collector = GitHubCollector(
            vcs_url="https://github.com/aboutcode-org/test",
            purl=PackageURL(type="github", namespace="aboutcode-org", name="test"),
        )

    @patch("os.getenv", return_value=None)
    def test_missing_token(self, mock_getenv):
        """Fetching without an API token raises a ValueError."""
        expected_error = "GH_API_TOKEN environment variable not set properly"
        with pytest.raises(ValueError, match=expected_error):
            self.github_collector.fetch_entries()

    def test_collect_items(self):
        """A CVE in the title or the body files the entry URL under that CVE."""
        issue_url = "https://github.com/aboutcode-org/test/issues/1"
        pr_url = "https://github.com/aboutcode-org/test/pulls/1"

        fake_issue = MagicMock()
        fake_issue.title = "Fix CVE-2024-1234"
        fake_issue.body = "test description"
        fake_issue.html_url = issue_url

        fake_pr = MagicMock()
        fake_pr.title = "Bump deps"
        fake_pr.body = "Fixes CVE-2024-5678"
        fake_pr.html_url = pr_url

        collector = self.github_collector
        collector.issues = [fake_issue]
        collector.prs = [fake_pr]
        collector.collect_items()

        assert collector.collected_items["vulnerabilities"] == {
            "CVE-2024-1234": {"Issues": [issue_url], "PRs": []},
            "CVE-2024-5678": {"Issues": [], "PRs": [pr_url]},
        }


class TestGitLabCollector:
    """Tests for the GitLab issues and merge requests collector."""

    def setup_method(self):
        """Build a collector for the gitlab-foss project."""
        self.gitlab_collector = GitLabCollector(
            vcs_url="https://gitlab.com/gitlab-org/gitlab-foss",
            purl=PackageURL(
                type="gitlab", namespace="gitlab-org", name="gitlab-foss"
            ),
        )

    @patch("os.getenv", return_value=None)
    def test_missing_token(self, mock_getenv):
        """Fetching without an API token raises a ValueError."""
        expected_error = "GLAB_API_TOKEN environment variable not set properly"
        with pytest.raises(ValueError, match=expected_error):
            self.gitlab_collector.fetch_entries()

    def test_collect_items(self):
        """Only entries that mention a CVE are collected, keyed by that CVE."""
        cve_issue_url = "https://gitlab.com/gitlab-org/gitlab-foss/-/issues/29992"
        merge_request_url = (
            "https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/10218"
        )

        collector = self.gitlab_collector
        collector.issues = [
            dict(
                title="Need security update for CVE-2018-11235",
                description="At the end of May, a severe security vulnerability was discovered in Git that pertains to submodules..",
                web_url=cve_issue_url,
            ),
            dict(
                title="Bump KaTeX version",
                description="No cve here",
                web_url="https://gitlab.com/gitlab-org/gitlab-foss/-/issues/51065",
            ),
        ]
        collector.prs = [
            dict(
                title="Temporarily ignore Nokogiri CVE-2016-4658",
                description="we can't do anything about it quickly, so we'll ignore the CVE in bundle-audit.",
                web_url=merge_request_url,
            )
        ]

        collector.collect_items()

        assert collector.collected_items["vulnerabilities"] == {
            "CVE-2018-11235": {"Issues": [cve_issue_url], "PRs": []},
            "CVE-2016-4658": {"Issues": [], "PRs": [merge_request_url]},
        }