From 9f060b09741ab299f5bb7aded90a31dd33344c1a Mon Sep 17 00:00:00 2001 From: allsmog Date: Mon, 30 Mar 2026 10:02:31 -0700 Subject: [PATCH] [pysrc2cpg] Expand dependency resolution and import resolution - Support pyproject.toml (PEP 621 and Poetry) and setup.cfg for dependency parsing - Support flexible version specifiers (>=, <=, ~=, etc.) in requirements.txt - Add __init__.py package import support - Add star import expansion via module cache lookup --- .../pysrc2cpg/ConfigFileCreationPass.scala | 4 +- .../DependenciesFromRequirementsTxtPass.scala | 183 ++++++++++++++++-- .../passes/DependencyPassTests.scala | 143 ++++++++++++++ .../PythonImportResolverPassTests.scala | 125 ++++++++++++ .../pysrc2cpg/PythonImportResolverPass.scala | 26 +++ 5 files changed, 459 insertions(+), 22 deletions(-) create mode 100644 joern-cli/frontends/pysrc2cpg/src/test/scala/io/joern/pysrc2cpg/passes/DependencyPassTests.scala create mode 100644 joern-cli/frontends/pysrc2cpg/src/test/scala/io/joern/pysrc2cpg/passes/PythonImportResolverPassTests.scala diff --git a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/ConfigFileCreationPass.scala b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/ConfigFileCreationPass.scala index 08e9728a80eb..48e9c994a7a2 100644 --- a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/ConfigFileCreationPass.scala +++ b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/ConfigFileCreationPass.scala @@ -23,7 +23,9 @@ class ConfigFileCreationPass(cpg: Cpg, requirementsTxt: String = "requirement.tx pathEndFilter(requirementsTxt), // Pipfile pathEndFilter("Pipfile"), - pathEndFilter("Pipfile.lock") + pathEndFilter("Pipfile.lock"), + // setup.cfg + pathEndFilter("setup.cfg") ) } diff --git a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/DependenciesFromRequirementsTxtPass.scala b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/DependenciesFromRequirementsTxtPass.scala index 
01a932277c44..3337968ccf11 100644
--- a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/DependenciesFromRequirementsTxtPass.scala
+++ b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/DependenciesFromRequirementsTxtPass.scala
@@ -8,32 +8,224 @@ import org.slf4j.{Logger, LoggerFactory}
 
 import scala.util.matching.Regex
 
-// This pass takes information out of specific CONFIG_FILE nodes in order to add DEPENDENCY nodes to the graph.
-/*
-example of a `requirements.txt` file that is valid for the pass:
-```
-click==7.1.2
-Flask==1.1.2
-itsdangerous==1.1.0
-Jinja2==2.11.3
-MarkupSafe==1.1.1
-Werkzeug==1.0.1
-```
- */
+/** This pass takes information out of specific CONFIG_FILE nodes in order to add DEPENDENCY nodes to the graph.
+  *
+  * Supports:
+  *   - requirements.txt (PEP 440 comparison operators, comma-separated clauses, extras, environment markers)
+  *   - pyproject.toml (PEP 621 `[project] dependencies` and Poetry `[tool.poetry.dependencies]`)
+  *   - setup.cfg (`[options] install_requires`)
+  *
+  * The files are scanned line by line with lightweight heuristics instead of a full TOML/INI parser. If a requirement
+  * has several version clauses (e.g. `flask>=2.0,<3.0`), the version of the first clause is stored on the node.
+  */
 class DependenciesFromRequirementsTxtPass(cpg: Cpg) extends CpgPass(cpg) {
   private val logger: Logger = LoggerFactory.getLogger(classOf[DependenciesFromRequirementsTxtPass])
+
+  /** Regex for one requirement: package name, optional extras, optional version clauses, optional environment marker,
+    * optional trailing comment. Group 1 is the package name; groups 2 and 3 are the operator and version of the first
+    * version clause (both null if the requirement is unpinned). Supported operators: ===, ==, ~=, !=, >=, <=, >, <.
+    */
+  private val RequirementsLinePattern: Regex = {
+    val operators    = """===|~=|==|!=|>=|<=|>|<"""
+    val version      = """[^\s;,#]+"""
+    val nameExtras   = """([A-Za-z0-9][\w.\-]*)(?:\[[^\]]*\])?"""
+    val firstClause  = "(" + operators + """)\s*(""" + version + ")"
+    val moreClauses  = """(?:\s*,\s*(?:""" + operators + """)\s*""" + version + ")*"
+    val markerOrNote = """\s*(?:;.*)?(?:#.*)?"""
+    val requirement  = """^\s*""" + nameExtras + """\s*(?:""" + firstClause + moreClauses + ")?" + markerOrNote + "$"
+    requirement.r
+  }
+
+  /** A single- or double-quoted, non-empty string; the content is in group 1 (double quotes) or 2 (single quotes). */
+  private val QuotedString: Regex = """"([^"]+)"|'([^']+)'""".r
+
+  /** Longest prefix of a TOML line that is not part of a `#` comment; `#` inside quoted strings is not a comment. */
+  private val TomlCodePrefix: Regex = """(?:[^"'#]+|"[^"]*"|'[^']*')*""".r
+
+  /** Bare package name as it may appear as a key in `[tool.poetry.dependencies]`. */
+  private val PackageName: Regex = """[A-Za-z0-9][\w.\-]*""".r
+
+  /** `dependencies = <value>` inside the `[project]` table; group 1 is the value. */
+  private val Pep621DependenciesKey: Regex = """^dependencies\s*=\s*(.*)$""".r
+
+  /** `version = "<constraint>"` inside a Poetry inline table; group 1 is the constraint. */
+  private val PoetryVersionKey: Regex = """\bversion\s*=\s*["']([^"']+)["']""".r
+
+  /** `install_requires = <value>` (`:` is accepted as delimiter, too) in setup.cfg; group 1 is the value. */
+  private val InstallRequiresKey: Regex = """^install_requires\s*[=:]\s*(.*)$""".r
+
   override def run(dstGraph: DiffGraphBuilder): Unit = {
-    cpg.configFile.filter(_.name.endsWith("requirements.txt")).foreach { node =>
-      val lines = node.content.split("\n")
-      lines.filter(_.matches("^[^=]+==[^=]+$")).foreach { line =>
-        val keyValPattern: Regex = "^([^=]+)==([^=]+)$".r
-        for (patternMatch <- keyValPattern.findAllMatchIn(line)) {
-          val name    = patternMatch.group(1)
-          val version = patternMatch.group(2)
-          val node    = NewDependency().name(name).version(version).dependencyGroupId(name)
-          dstGraph.addNode(node)
-        }
-      }
-    }
+    cpg.configFile.foreach { node =>
+      node.name match {
+        case name if name.endsWith("requirements.txt") => parseRequirementsTxt(node.content, dstGraph)
+        case name if name.endsWith("pyproject.toml")   => parsePyprojectToml(node.content, dstGraph)
+        case name if name.endsWith("setup.cfg")        => parseSetupCfg(node.content, dstGraph)
+        case _                                         => // not a dependency manifest known to this pass
+      }
+    }
+  }
+
+  /** Parses a requirements.txt file, which holds one requirement per line. Blank lines, comment lines and pip option
+    * lines (`-r`, `-c`, `-e`, `--index-url`, `--hash=...`, ...) are skipped. A trailing line-continuation backslash
+    * and per-requirement options such as `--hash=...` are cut off before the requirement is parsed.
+    */
+  private def parseRequirementsTxt(content: String, dstGraph: DiffGraphBuilder): Unit = {
+    content.linesIterator
+      .map(_.trim)
+      .filterNot(line => line.isEmpty || line.startsWith("#") || line.startsWith("-"))
+      .map(line => line.stripSuffix("\\").replaceFirst("\\s--.*$", ""))
+      .foreach(requirement => parseRequirementString(requirement, dstGraph))
+  }
+
+  /** Parses a pyproject.toml file; both the PEP 621 and the Poetry dependency tables are evaluated. */
+  private def parsePyprojectToml(content: String, dstGraph: DiffGraphBuilder): Unit = {
+    parsePep621Dependencies(content, dstGraph)
+    parsePoetryDependencies(content, dstGraph)
+  }
+
+  /** Parse PEP 621 style, where `dependencies` is a single- or multi-line array of requirement strings:
+    * {{{
+    * [project]
+    * dependencies = ["flask>=2.0", "requests"]
+    * }}}
+    */
+  private def parsePep621Dependencies(content: String, dstGraph: DiffGraphBuilder): Unit = {
+    var inProject    = false
+    var inDepArray   = false
+    val requirements = scala.collection.mutable.ArrayBuffer.empty[String]
+
+    // Stores the requirement strings of one array fragment; returns true if the array is still open afterwards.
+    // Quoted strings are removed before looking for `]` so that extras (`"requests[security]"`) do not close it.
+    def consumeArrayFragment(fragment: String): Boolean = {
+      requirements ++= extractQuotedStrings(fragment)
+      !QuotedString.replaceAllIn(fragment, "").contains("]")
+    }
+
+    content.linesIterator.map(line => stripTomlComment(line).trim).foreach { line =>
+      if (line.startsWith("[") && line.endsWith("]")) {
+        inProject = line == "[project]"
+        inDepArray = false
+      } else if (inDepArray) {
+        inDepArray = consumeArrayFragment(line)
+      } else if (inProject) {
+        line match {
+          case Pep621DependenciesKey(value) if value.startsWith("[") => inDepArray = consumeArrayFragment(value)
+          case _                                                     => // some other key of the [project] table
+        }
+      }
+    }
+
+    requirements.foreach(requirement => parseRequirementString(requirement, dstGraph))
+  }
+
+  /** Parse Poetry style, where every key of the table is a package and the value is either a version constraint or
+    * an inline table holding one:
+    * {{{
+    * [tool.poetry.dependencies]
+    * flask = "^2.0"
+    * requests = {version = "^2.28", optional = true}
+    * }}}
+    * The `python` entry is skipped, so no dependency node is created for the interpreter constraint.
+    */
+  private def parsePoetryDependencies(content: String, dstGraph: DiffGraphBuilder): Unit = {
+    var inPoetryDeps = false
+
+    content.linesIterator.map(line => stripTomlComment(line).trim).foreach { line =>
+      if (line.startsWith("[") && line.endsWith("]")) {
+        inPoetryDeps = line == "[tool.poetry.dependencies]"
+      } else if (inPoetryDeps) {
+        line.split("=", 2) match {
+          case Array(rawName, rawValue) =>
+            val pkgName = unquote(rawName.trim)
+            // The name check also rejects continuation lines of multi-line values, e.g. `{version = "1.0"},`.
+            if (pkgName != "python" && PackageName.matches(pkgName)) {
+              addDependency(pkgName, poetryVersion(rawValue.trim), dstGraph)
+            }
+          case _ => // not a `key = value` line
+        }
+      }
+    }
+  }
+
+  /** Extracts the version constraint from the value of a Poetry dependency entry. Inline tables and arrays yield
+    * their first `version = "..."` field, or the empty string if there is none; plain values are unquoted.
+    */
+  private def poetryVersion(value: String): String = {
+    if (value.startsWith("{") || value.startsWith("[")) {
+      PoetryVersionKey.findFirstMatchIn(value).map(_.group(1)).getOrElse("")
+    } else {
+      unquote(value)
+    }
+  }
+
+  /** Parse setup.cfg, where the requirements follow the key on the same line and/or on indented continuation lines:
+    * {{{
+    * [options]
+    * install_requires =
+    *     flask>=2.0
+    *     requests
+    * }}}
+    */
+  private def parseSetupCfg(content: String, dstGraph: DiffGraphBuilder): Unit = {
+    var inOptions     = false
+    var inInstallReqs = false
+
+    content.linesIterator.foreach { line =>
+      val trimmed  = line.trim
+      val indented = line.startsWith(" ") || line.startsWith("\t")
+      if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
+        inOptions = trimmed == "[options]"
+        inInstallReqs = false
+      } else if (trimmed.isEmpty || trimmed.startsWith("#") || trimmed.startsWith(";")) {
+        // Blank lines and comment lines neither add a requirement nor end a multi-line value.
+      } else if (inInstallReqs && indented) {
+        parseRequirementString(trimmed, dstGraph)
+      } else {
+        // Not a continuation line: this ends a preceding install_requires value and may start a new one.
+        inInstallReqs = false
+        if (inOptions) {
+          trimmed match {
+            case InstallRequiresKey(value) =>
+              inInstallReqs = true
+              if (value.nonEmpty) parseRequirementString(value, dstGraph)
+            case _ => // some other key of the [options] section
+          }
+        }
+      }
+    }
+  }
+
+  /** Parse a single requirement string like "flask>=2.0" or "requests" into a dependency node. Surrounding quotes
+    * and a trailing comma are tolerated; strings that are not plain requirements (e.g. URLs) are ignored.
+    */
+  private def parseRequirementString(reqStr: String, dstGraph: DiffGraphBuilder): Unit = {
+    val cleaned = unquote(reqStr.trim.replaceAll(",\\s*$", "")).trim
+    cleaned match {
+      case RequirementsLinePattern(pkgName, _, version) =>
+        // `version` is null when the optional version group did not take part in the match
+        addDependency(pkgName, Option(version).getOrElse(""), dstGraph)
+      case _ => // skip
+    }
+  }
+
+  /** Adds a DEPENDENCY node; the package name doubles as the dependency group id. */
+  private def addDependency(name: String, version: String, dstGraph: DiffGraphBuilder): Unit = {
+    dstGraph.addNode(NewDependency().name(name).version(version).dependencyGroupId(name))
+  }
+
+  /** Removes a trailing `#` comment from a TOML line; `#` inside quoted strings is kept. */
+  private def stripTomlComment(line: String): String = {
+    TomlCodePrefix.findPrefixOf(line).getOrElse(line)
+  }
+
+  /** Removes one pair of matching single or double quotes around `s`, if present. */
+  private def unquote(s: String): String = {
+    val quoted = s.length >= 2 && (s.head == '"' || s.head == '\'') && s.last == s.head
+    if (quoted) s.substring(1, s.length - 1) else s
+  }
+
+  /** Returns the contents of all non-empty single- or double-quoted strings in `s`, in order of appearance. */
+  private def extractQuotedStrings(s: String): Seq[String] = {
+    QuotedString.findAllMatchIn(s).map(m => Option(m.group(1)).getOrElse(m.group(2))).toSeq
   }
 }
diff --git a/joern-cli/frontends/pysrc2cpg/src/test/scala/io/joern/pysrc2cpg/passes/DependencyPassTests.scala b/joern-cli/frontends/pysrc2cpg/src/test/scala/io/joern/pysrc2cpg/passes/DependencyPassTests.scala
new file mode 100644
index 000000000000..2963ce13c958
--- /dev/null
+++ b/joern-cli/frontends/pysrc2cpg/src/test/scala/io/joern/pysrc2cpg/passes/DependencyPassTests.scala
@@ -0,0 +1,143 @@
+package io.joern.pysrc2cpg.passes
+
+import io.joern.pysrc2cpg.testfixtures.PySrc2CpgFixture
+import io.shiftleft.semanticcpg.language.*
+
+class DependencyPassTests extends PySrc2CpgFixture(withOssDataflow = false) {
+
+  "requirements.txt with exact pinning (==)" should {
+    lazy val cpg = code(
+      """Flask==1.1.2
+        |requests==2.28.0
+        |""".stripMargin,
+      "requirements.txt"
+    )
+
+    "create dependency nodes with name and version" in {
+      val deps = cpg.dependency.l.sortBy(_.name)
+      deps.size shouldBe 2
+      deps.head.name shouldBe "Flask"
+      deps.head.version shouldBe "1.1.2"
+      deps.last.name shouldBe "requests"
+      deps.last.version shouldBe "2.28.0"
+    }
+  }
+
+  "requirements.txt with flexible specifiers" should {
+    lazy val cpg = code(
+      """flask>=2.0.0
+        |requests~=2.28
+        |numpy<=1.24.0
+        |pandas!=1.5.0
+        |scipy>1.9
+        |matplotlib<3.8
+        |bare-package
+        |# this is a
comment + |-r other-requirements.txt + |--index-url https://pypi.org/simple + |package-with-extras[security]>=1.0 + |conditional-pkg>=1.0; python_version >= "3.8" + |""".stripMargin, + "requirements.txt" + ) + + "create dependency nodes for all specifier styles" in { + val deps = cpg.dependency.l + val depMap = deps.map(d => d.name -> d.version).toMap + depMap("flask") shouldBe "2.0.0" + depMap("requests") shouldBe "2.28" + depMap("numpy") shouldBe "1.24.0" + depMap("pandas") shouldBe "1.5.0" + depMap("scipy") shouldBe "1.9" + depMap("matplotlib") shouldBe "3.8" + depMap("bare-package") shouldBe "" + depMap("package-with-extras") shouldBe "1.0" + depMap("conditional-pkg") shouldBe "1.0" + } + + "skip comments, includes, and option flags" in { + val depNames = cpg.dependency.name.l.toSet + depNames should not contain "comment" + depNames should not contain "other-requirements.txt" + depNames should not contain "index-url" + } + } + + "pyproject.toml with PEP 621 dependencies" should { + lazy val cpg = code( + """[project] + |name = "my-project" + |dependencies = [ + | "flask>=2.0", + | "requests~=2.28", + | "click", + |] + | + |[tool.other] + |something = "else" + |""".stripMargin, + "pyproject.toml" + ) + + "create dependency nodes from PEP 621 format" in { + val deps = cpg.dependency.l + val depMap = deps.map(d => d.name -> d.version).toMap + depMap("flask") shouldBe "2.0" + depMap("requests") shouldBe "2.28" + depMap("click") shouldBe "" + } + } + + "pyproject.toml with Poetry dependencies" should { + lazy val cpg = code( + """[tool.poetry.dependencies] + |python = "^3.8" + |flask = "^2.0" + |requests = {version = "^2.28", optional = true} + | + |[tool.poetry.dev-dependencies] + |pytest = "^7.0" + |""".stripMargin, + "pyproject.toml" + ) + + "create dependency nodes from Poetry format" in { + val deps = cpg.dependency.l + val depNames = deps.map(_.name).toSet + depNames should contain("flask") + depNames should contain("requests") + depNames should not contain 
"python" + + val depMap = deps.map(d => d.name -> d.version).toMap + depMap("flask") shouldBe "^2.0" + depMap("requests") shouldBe "^2.28" + } + } + + "setup.cfg with install_requires" should { + lazy val cpg = code( + """[metadata] + |name = my-project + | + |[options] + |install_requires = + | flask>=2.0 + | requests~=2.28 + | click + | + |[options.extras_require] + |dev = pytest + |""".stripMargin, + "setup.cfg" + ) + + "create dependency nodes from setup.cfg format" in { + val deps = cpg.dependency.l + val depMap = deps.map(d => d.name -> d.version).toMap + depMap("flask") shouldBe "2.0" + depMap("requests") shouldBe "2.28" + depMap("click") shouldBe "" + } + } + +} diff --git a/joern-cli/frontends/pysrc2cpg/src/test/scala/io/joern/pysrc2cpg/passes/PythonImportResolverPassTests.scala b/joern-cli/frontends/pysrc2cpg/src/test/scala/io/joern/pysrc2cpg/passes/PythonImportResolverPassTests.scala new file mode 100644 index 000000000000..ba5dcbdeb820 --- /dev/null +++ b/joern-cli/frontends/pysrc2cpg/src/test/scala/io/joern/pysrc2cpg/passes/PythonImportResolverPassTests.scala @@ -0,0 +1,125 @@ +package io.joern.pysrc2cpg.passes + +import io.joern.pysrc2cpg.testfixtures.PySrc2CpgFixture +import io.shiftleft.semanticcpg.language.* +import io.shiftleft.semanticcpg.language.importresolver.* + +import java.io.File + +class PythonImportResolverPassTests extends PySrc2CpgFixture(withOssDataflow = false) { + + "__init__.py package imports" should { + + lazy val cpg = code( + """ + |def greet(name): + | return "Hello " + name + | + |class Greeter: + | pass + |""".stripMargin, + Seq("mypkg", "__init__.py").mkString(File.separator) + ).moreCode( + """ + |from mypkg import greet, Greeter + | + |result = greet("world") + |g = Greeter() + |""".stripMargin, + "app.py" + ) + + "resolve function import from __init__.py" in { + val resolvedImports = cpg.file(".*app.py").ast.isCall + .where(_.referencedImports) + .tag + ._toEvaluatedImport + .collect { case r: ResolvedMethod => r } + .l 
+ resolvedImports.map(_.fullName) should contain( + Seq("mypkg", "__init__.py:.greet").mkString(File.separator) + ) + } + + "resolve type import from __init__.py" in { + val resolvedImports = cpg.file(".*app.py").ast.isCall + .where(_.referencedImports) + .tag + ._toEvaluatedImport + .collect { case r: ResolvedTypeDecl => r } + .l + resolvedImports.map(_.fullName) should contain( + Seq("mypkg", "__init__.py:.Greeter").mkString(File.separator) + ) + } + } + + "star import expansion" should { + + lazy val cpg = code( + """ + |def helper(): + | return 42 + | + |class Widget: + | pass + | + |x = 10 + |""".stripMargin, + "utils.py" + ).moreCode( + """ + |from utils import * + | + |result = helper() + |w = Widget() + |""".stripMargin, + "main.py" + ) + + "expand star import to resolve individual members" in { + val resolvedImports = cpg.file(".*main.py").ast.isCall + .where(_.referencedImports) + .tag + ._toEvaluatedImport + .l + + // Should have resolved imports for helper (method), Widget (type+constructor), and x (member) + val resolvedMethods = resolvedImports.collect { case r: ResolvedMethod => r } + val resolvedTypes = resolvedImports.collect { case r: ResolvedTypeDecl => r } + val resolvedMembers = resolvedImports.collect { case r: ResolvedMember => r } + + resolvedMethods.map(_.fullName) should contain("utils.py:.helper") + resolvedTypes.map(_.fullName) should contain("utils.py:.Widget") + resolvedMembers.map(_.memberName) should contain("x") + } + } + + "fallback for unresolvable external module" should { + + lazy val cpg = code( + """ + |from unknown_external_lib import SomeClass + | + |obj = SomeClass() + |""".stripMargin, + "consumer.py" + ) + + "create pseudo imports for external modules" in { + val resolvedImports = cpg.file(".*consumer.py").ast.isCall + .where(_.referencedImports) + .tag + ._toEvaluatedImport + .l + // For an unresolved import with a capitalized name, we expect UnknownMethod and UnknownTypeDecl + val unknownMethods = 
resolvedImports.collect { case u: UnknownMethod => u } + val unknownTypes = resolvedImports.collect { case u: UnknownTypeDecl => u } + + unknownMethods should not be empty + unknownTypes should not be empty + unknownTypes.head.fullName shouldBe "unknown_external_lib.py:.SomeClass" + } + } + +} diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/frontendspecific/pysrc2cpg/PythonImportResolverPass.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/frontendspecific/pysrc2cpg/PythonImportResolverPass.scala index 532ba2271e43..560fe0cbbe18 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/frontendspecific/pysrc2cpg/PythonImportResolverPass.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/frontendspecific/pysrc2cpg/PythonImportResolverPass.scala @@ -72,6 +72,32 @@ class PythonImportResolverPass(cpg: Cpg) extends XImportResolverPass(cpg) { importedEntityAsFullyQualifiedImport ).filterNot(_.isBlank).mkString(".") + // Handle star imports: `from module import *` + if (importedAs == "*") { + val baseEntity = importedEntity.stripSuffix(".*") + val baseEntityAsFullyQualified = fileToPythonImportNotation(baseEntity.replaceFirst("^\\.+", "")) + val baseEntityAsRelative = Seq( + fileToPythonImportNotation(currDir.toString.stripPrefix(codeRootDir).stripPrefix(JFile.separator)), + baseEntityAsFullyQualified + ).filterNot(_.isBlank).mkString(".") + + val modulePaths = Seq(baseEntityAsRelative, baseEntityAsFullyQualified).distinct + val starImports = modulePaths.flatMap { basePath => + moduleCache.collect { + case (key, entity) if key.startsWith(basePath + ".") && !key.drop(basePath.length + 1).contains(".") => + val memberName = key.drop(basePath.length + 1) + entity.toResolvedImport(memberName) + }.flatten + } + + if (starImports.nonEmpty) { + starImports.foreach(x => evaluatedImportToTag(x, importCall, diffGraph)) + } else { + createPseudoImports(importedEntity, importedAs).map(x => evaluatedImportToTag(x, 
importCall, diffGraph)).l + } + return + } + // We evaluated both variations, based on what we could expect from different versions of Python and how the package // layout is interpreted by the presence of lack of `__init__.py` files. Additionally, external packages are always // fully qualified.