tracker.py: Adding new helper to read DSA data

Adding a new function parse_tracker_data() which reads the DSA data from the file taken from this URL: https://salsa.debian.org/security-tracker-team/security-tracker/-/raw/master/data/DSA/list
author: Carsten Schoenert <c.schoenert@t-online.de> 2023-11-05 10:38:33 +0100
committer: Thomas Lange <lange@debian.org> 2023-12-02 16:47:03 +0100
commit: a215df58b97423573c8e055d5017f9010a154aa7 (patch)
tree: c888499fdda4d2ab8754bbd022143a185992e29b /english/security
parent: 0b22b8f0d4e00a2cfc2b946d7e99015fbf2bea61 (diff)
1 files changed, 148 insertions, 0 deletions
diff --git a/english/security/oval/oval/parser/tracker.py b/english/security/oval/oval/parser/tracker.py
new file mode 100644
index 00000000000..58c65a2aedb
--- /dev/null
+++ b/english/security/oval/oval/parser/tracker.py
@@ -0,0 +1,148 @@
+"""
+oval.parser.tracker
+
+Functions for parsing data from the security-tracker data file
+
+Copyright (c) 2023 Carsten Schoenert <c.schoenert@t-online.de>
+              2023 Thomas Lange <lange@cs.uni-koeln.de>
+
+SPDX-License-Identifier: GPL-2.0-or-later
+"""
+
+import re
+from typing import Any
+
+# Dictionary containing all the regular expressions for scanning the
+# tracker data.
+# Basic idea is taken from https://www.vipinajayakumar.com/parsing-text-with-python
+regex_dicts = {
+    name: re.compile(pattern)
+    for name, pattern in (
+        # Advisory header line, e.g.
+        #   [15 Sep 2023] DSA-5498-1 thunderbird - security update
+        # group 1: date, 2: advisory id, 3: package, 4: description
+        # https://regexper.com/#%5E%5C%5B%3F%28.%2B%29%5C%5D%5Cs%28D%5BSL%5DA-%5Cd%2B%28%3F%3A-%5Cd%2B%29%3F%29%5Cs%28%5CS%2B%29%5Cs%28%3F%3A-%20%29%3F%28.%2B%29%24
+        # https://regex101.com/r/Vz8tx7/1
+        ("DSA-DATA",
+         r"^\[?(.+)\]\s(D[SL]A-\d+(?:-\d+)?)\s(\S+)\s(?:- )?(.+)$"),
+
+        # CVE reference line, e.g.
+        #   {CVE-2004-0835 CVE-2004-0836 CVE-2004-0837}
+        # group 1: the space separated CVE ids between the braces
+        # https://regexper.com/#%5C%7B%28%5BCVE0-9%20-%5D%2B%29%7D
+        # https://regex101.com/r/6MEGte/1
+        ("CVE-DATA", r"\{([CVE0-9 -]+)}"),
+
+        # Release line (has to start with whitespace), e.g.
+        #   [bullseye] - chromium 116.0.5845.140-1~deb11u1
+        #   [bookworm] - chromium 116.0.5845.140-1~deb12u1
+        # group 1: release, 2: package, 3: version
+        # https://regexper.com/#%5E%5Cs%2B%5C%5B%28%5CS%2B%29%5D%5Cs-%5Cs%28.%2B%29%5Cs%28%5CS%2B%29%24
+        # https://regex101.com/r/S2yy8S/1
+        ("RELEASE-DATA", r"^\s+\[(\S+)]\s-\s(.+)\s(\S+)$"),
+    )
+}
+
+
+def parse_tracker_data(
+        file: str,
+        debian_version: dict[str, str]
+    ) -> dict[str, list[Any]]:
+    """Parse the D[L,S]A list of the Security Team into a dictionary.
+
+    The list is provided on
+    https://salsa.debian.org/security-tracker-team/security-tracker/-/raw/master/data/DSA/list
+    and is maintained by the Security Team.
+
+    Parameters:
+        file (str): Path of the local copy of the list to parse.
+        debian_version (dict): Maps the release names used in the list
+            (e.g. "bookworm") to the keys used in the returned data.
+
+    Returns:
+        dict: Advisory id (a trailing "-1" removed) mapped to a list of
+            - a tuple ("<id> <package>", dict with the keys title, date,
+              packages, secrefs, vulnerable and fixed),
+            - a dict with the keys "description" and "moreinfo",
+            - a dict {release key: {"all": {package: version}}}.
+
+    Raises:
+        KeyError: A release in the list is missing from debian_version.
+    """
+    # Only reading needs the open file; parsing happens after it is closed.
+    with open(file, "r", encoding="UTF-8") as data_source:
+        raw_data = data_source.read()
+
+    final_dict: dict[str, list[Any]] = {}
+    # Every advisory starts with '[' at the beginning of a line, so splitting
+    # on '\n[' yields one chunk per D[L,S]A (without that leading '[').
+    for entry in raw_data.split("\n["):
+        cve_list: list[str] = []
+        dsa_date = dsa_number = dsa_pkg = ""
+        wml_data_dict1: dict[str, str] = {}
+        wml_data_dict2: dict[str, dict[str, Any]] = {}
+
+        for line in entry.split("\n"):
+            # Try each pattern on the line; the key names the kind of line.
+            for key, regex in regex_dicts.items():
+                match = regex.search(line)
+                if not match:
+                    continue
+                if key == "DSA-DATA":
+                    dsa_date, dsa_number, dsa_pkg, dsa_desc = match.groups()
+                    wml_data_dict1["description"] = dsa_desc
+                    wml_data_dict1["moreinfo"] = "no info"
+                elif key == "CVE-DATA":
+                    cve_list = match.group(1).split(" ")
+                elif key == "RELEASE-DATA":
+                    codename, package, version = match.groups()
+                    # Merge into the release's package map instead of
+                    # replacing it, otherwise an advisory listing several
+                    # packages for one release keeps only the last one.
+                    release_data = wml_data_dict2.setdefault(
+                        debian_version[codename], {"all": {}}
+                    )
+                    release_data["all"][package] = version
+
+        if not dsa_number:
+            # Chunk without an advisory header line (e.g. an empty file or
+            # leading text): skip it instead of storing it under key "".
+            continue
+
+        if not cve_list:
+            # There might be no CVE assigned yet, catching these cases.
+            cve_list = ["not yet available"]
+
+        dsa_row = (
+            f"{dsa_number} {dsa_pkg}", {
+                "title": f"{dsa_number} {dsa_pkg}",
+                "date": dsa_date,
+                "packages": dsa_pkg,
+                "secrefs": cve_list,
+                "vulnerable": "yes",
+                "fixed": "yes"}
+        )
+
+        final_dict[dsa_number.removesuffix("-1")] = [
+            dsa_row,
+            wml_data_dict1,
+            wml_data_dict2,
+        ]
+
+    return final_dict
+
+def test(data=None):
+    """Print a parse_tracker_data() result for manual inspection.
+
+    Parameters:
+        data (dict): Parsed data, advisory id -> [row, wml1, wml2]. Optional,
+            so that calling test() without arguments prints nothing.
+    """
+    for dsa_ref, value in (data or {}).items():
+        print(f"dsaResult\n {value[0]}")
+        print(f"wmlResult1\n {value[1]}")
+        print(f"wmlResult2\n {value[2]}")
+        print(f"dsaRef\n {dsa_ref}")
+
+#test()
author	Carsten Schoenert <c.schoenert@t-online.de>	2023-11-05 10:38:33 +0100
committer	Thomas Lange <lange@debian.org>	2023-12-02 16:47:03 +0100
commit	a215df58b97423573c8e055d5017f9010a154aa7 (patch)
tree	c888499fdda4d2ab8754bbd022143a185992e29b /english/security
parent	0b22b8f0d4e00a2cfc2b946d7e99015fbf2bea61 (diff)