|
@@ -0,0 +1,758 @@
|
|
|
|
|
+"""
|
|
|
|
|
+Metadata extractor for autusm.
|
|
|
|
|
+
|
|
|
|
|
+This module provides functionality to extract metadata from various
|
|
|
|
|
+package configuration files like package.json, setup.py, Cargo.toml, etc.
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+import os
|
|
|
|
|
+import json
|
|
|
|
|
+import logging
|
|
|
|
|
+import re
|
|
|
|
|
+import ast
|
|
|
|
|
+import toml
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+from typing import Dict, List, Optional, Any, Tuple
|
|
|
|
|
+
|
|
|
|
|
+from .models import PackageInfo, License, LicenseCategory
|
|
|
|
|
+from .exceptions import MetadataError
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
class MetadataExtractor:
    """Extractor for package metadata from various configuration files.

    The extractor walks a source tree, records every manifest whose file
    name is listed in ``package_patterns`` and merges what the supported
    manifests declare into a single ``PackageInfo``.  Scalar fields (name,
    version, summary, url) keep the first value that was found; authors,
    licenses and dependencies accumulate across manifests.
    """

    def __init__(self):
        """Initialize the metadata extractor."""
        # File names (or "*.ext" suffix patterns) that identify a manifest,
        # grouped by ecosystem
        self.package_patterns = {
            "python": [
                "setup.py",
                "pyproject.toml",
                "setup.cfg",
                "requirements.txt"
            ],
            "rust": [
                "Cargo.toml",
                "Cargo.lock"
            ],
            "node": [
                "package.json",
                "package-lock.json",
                "npm-shrinkwrap.json"
            ],
            "ruby": [
                "Gemfile",
                "*.gemspec"
            ],
            "perl": [
                "Makefile.PL",
                "META.json",
                "META.yml"
            ],
            "php": [
                "composer.json"
            ],
            "java": [
                "pom.xml",
                "build.gradle"
            ]
        }

        # License mapping for common license identifiers
        self.license_mapping = {
            "MIT": LicenseCategory.LIBRE,
            "Apache-2.0": LicenseCategory.LIBRE,
            "GPL-2.0": LicenseCategory.LIBRE,
            "GPL-3.0": LicenseCategory.LIBRE,
            "LGPL-2.1": LicenseCategory.LIBRE,
            "LGPL-3.0": LicenseCategory.LIBRE,
            "BSD-2-Clause": LicenseCategory.LIBRE,
            "BSD-3-Clause": LicenseCategory.LIBRE,
            "ISC": LicenseCategory.LIBRE,
            "MPL-2.0": LicenseCategory.LIBRE,
            "AGPL-3.0": LicenseCategory.LIBRE,
            "BSL-1.0": LicenseCategory.LIBRE,
            "Unlicense": LicenseCategory.LIBRE,
            "CC0-1.0": LicenseCategory.LIBRE,
            "EPL-1.0": LicenseCategory.LIBRE,
            "EPL-2.0": LicenseCategory.LIBRE,
            "Proprietary": LicenseCategory.PROPRIETARY,
            "Commercial": LicenseCategory.PROPRIETARY
        }

    def extract(self, source_dir: Path) -> "PackageInfo":
        """Extract metadata from a source directory.

        Args:
            source_dir: Path to the source directory

        Returns:
            PackageInfo object with extracted metadata

        Raises:
            MetadataError: If extraction fails
        """
        try:
            logger.info(f"Extracting metadata from {source_dir}")

            package_info = PackageInfo()

            # Manifest file name -> method that knows how to read it.
            # Manifests without an entry are only recorded in metadata_files.
            handlers = {
                "package.json": self._extract_from_package_json,
                "setup.py": self._extract_from_setup_py,
                "pyproject.toml": self._extract_from_pyproject_toml,
                "Cargo.toml": self._extract_from_cargo_toml,
                "composer.json": self._extract_from_composer_json,
                "pom.xml": self._extract_from_pom_xml,
                "build.gradle": self._extract_from_build_gradle,
            }

            # Find and process package files
            for file_path in self._find_package_files(source_dir):
                relative_path = file_path.relative_to(source_dir)
                package_info.metadata_files.append(str(relative_path))

                handler = handlers.get(file_path.name)
                if handler is not None:
                    handler(file_path, package_info)

            # Extract additional metadata from common locations
            self._extract_from_readme(source_dir, package_info)
            self._extract_from_license_files(source_dir, package_info)
            self._extract_from_git_info(source_dir, package_info)

            # If we still don't have a name, try to derive from directory.
            # The path is made absolute first because Path(".").name is "".
            if not package_info.name:
                directory_name = Path(os.path.abspath(source_dir)).name
                package_info.name = directory_name.lower().replace('-', '_').replace(' ', '_')

            logger.info(f"Extracted metadata for package: {package_info.name}")
            return package_info

        except Exception as e:
            logger.error(f"Failed to extract metadata: {e}")
            # Chain the cause so the original traceback stays attached
            raise MetadataError(f"Failed to extract metadata: {e}") from e

    def _find_package_files(self, source_dir: Path) -> List[Path]:
        """Find package configuration files in the source directory.

        Directories and files are visited in sorted order so the result (and
        therefore which manifest supplies the first-writer-wins fields) does
        not depend on filesystem enumeration order.

        Args:
            source_dir: Path to the source directory

        Returns:
            List of package configuration file paths
        """
        skipped_dirs = {'build', 'target', 'node_modules', '__pycache__'}
        all_patterns = [
            pattern
            for patterns in self.package_patterns.values()
            for pattern in patterns
        ]
        package_files = []

        for root, dirs, files in os.walk(source_dir):
            # Skip hidden directories and common build directories; assigning
            # through dirs[:] prunes (and orders) the walk in place
            dirs[:] = sorted(
                d for d in dirs
                if not d.startswith('.') and d not in skipped_dirs
            )

            for file in sorted(files):
                if any(self._match_pattern(file, pattern) for pattern in all_patterns):
                    package_files.append(Path(root) / file)

        return package_files

    def _match_pattern(self, filename: str, pattern: str) -> bool:
        """Check if a filename matches a pattern.

        Args:
            filename: Name of the file
            pattern: Exact file name, or a "*.ext" suffix pattern

        Returns:
            True if the filename matches the pattern
        """
        if pattern.startswith("*."):
            # "*.ext" -> compare against the ".ext" suffix
            return filename.endswith(pattern[1:])
        return filename == pattern

    def _add_license(self, package_info: "PackageInfo", license_name: Any,
                     text: str = "LICENSE") -> None:
        """Append a License entry unless one with the same name already exists.

        Args:
            package_info: PackageInfo object to update
            license_name: License name; ignored unless it is a non-empty string
            text: Value stored in the ``text`` field of the new entry
        """
        if not isinstance(license_name, str) or not license_name.strip():
            return
        if any(existing.name == license_name for existing in package_info.licenses):
            return

        category = self._map_license_category(license_name)
        package_info.licenses.append(
            License(name=license_name, text=text, category=category)
        )

    @staticmethod
    def _dependency_names(table: Any) -> List[str]:
        """Return the keys of a name -> constraint table, or [] if it is not a mapping."""
        return list(table) if isinstance(table, dict) else []

    @staticmethod
    def _package_json_license_names(data: Dict[str, Any]) -> List[str]:
        """Collect license names from the ``license`` / ``licenses`` fields of package.json.

        Accepts the string form, the legacy object form ``{"type": ...}`` and
        the legacy ``licenses`` array of either.
        """
        entries = []
        if "license" in data:
            entries.append(data["license"])
        legacy = data.get("licenses")
        if isinstance(legacy, list):
            entries.extend(legacy)

        names = []
        for entry in entries:
            if isinstance(entry, dict):
                entry = entry.get("type")
            if isinstance(entry, str) and entry:
                names.append(entry)
        return names

    @staticmethod
    def _is_setup_call(func: Any) -> bool:
        """Return True for the callee of ``setup(...)`` or ``<module>.setup(...)``."""
        if isinstance(func, ast.Name):
            return func.id == 'setup'
        if isinstance(func, ast.Attribute):
            return func.attr == 'setup'
        return False

    def _extract_from_package_json(self, file_path: Path, package_info: "PackageInfo") -> None:
        """Extract metadata from package.json file.

        Failures are logged as warnings and never propagate.

        Args:
            file_path: Path to the package.json file
            package_info: PackageInfo object to update
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, dict):
                raise ValueError("top-level JSON value is not an object")

            # Extract basic information
            if not package_info.name and "name" in data:
                package_info.name = data["name"]

            if not package_info.version and "version" in data:
                package_info.version = data["version"]

            if not package_info.summary and "description" in data:
                package_info.summary = data["description"]

            if not package_info.url and "homepage" in data:
                package_info.url = data["homepage"]
            elif not package_info.url and "repository" in data:
                repository = data["repository"]
                if isinstance(repository, dict) and "url" in repository:
                    package_info.url = repository["url"]
                elif isinstance(repository, str):
                    package_info.url = repository

            # Extract author information
            author = data.get("author")
            if isinstance(author, dict):
                author = author.get("name", "")
            if isinstance(author, str) and author:
                package_info.authors.append(author)

            # Extract license information
            for license_name in self._package_json_license_names(data):
                self._add_license(package_info, license_name)

            # Extract dependencies
            package_info.runtime_dependencies.extend(
                self._dependency_names(data.get("dependencies"))
            )
            package_info.build_dependencies.extend(
                self._dependency_names(data.get("devDependencies"))
            )

        except Exception as e:
            logger.warning(f"Failed to extract metadata from {file_path}: {e}")

    def _extract_from_setup_py(self, file_path: Path, package_info: "PackageInfo") -> None:
        """Extract metadata from setup.py file.

        The file is parsed, never executed; only literal keyword arguments of
        the first ``setup(...)`` / ``<module>.setup(...)`` call are read.

        Args:
            file_path: Path to the setup.py file
            package_info: PackageInfo object to update
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Parse the setup.py file to extract setup() arguments
            tree = ast.parse(content)

            for node in ast.walk(tree):
                if not (isinstance(node, ast.Call) and self._is_setup_call(node.func)):
                    continue

                # Extract keyword arguments
                for keyword in node.keywords:
                    arg_name = keyword.arg
                    arg_value = self._extract_ast_value(keyword.value)
                    is_text = isinstance(arg_value, str) and bool(arg_value)

                    if arg_name == "name":
                        if is_text and not package_info.name:
                            package_info.name = arg_value
                    elif arg_name == "version":
                        if is_text and not package_info.version:
                            package_info.version = arg_value
                    elif arg_name == "description":
                        if is_text and not package_info.summary:
                            package_info.summary = arg_value
                    elif arg_name == "url":
                        if is_text and not package_info.url:
                            package_info.url = arg_value
                    elif arg_name == "author":
                        if is_text:
                            package_info.authors.append(arg_value)
                    elif arg_name == "license":
                        self._add_license(package_info, arg_value)
                    elif arg_name == "install_requires":
                        if isinstance(arg_value, list):
                            package_info.runtime_dependencies.extend(arg_value)
                    elif arg_name == "setup_requires":
                        if isinstance(arg_value, list):
                            package_info.build_dependencies.extend(arg_value)

                # Only the first setup() call is considered
                break

        except Exception as e:
            logger.warning(f"Failed to extract metadata from {file_path}: {e}")

    def _extract_from_pyproject_toml(self, file_path: Path, package_info: "PackageInfo") -> None:
        """Extract metadata from pyproject.toml file.

        Reads the ``[project]`` table and the ``[tool.poetry]`` table.

        Args:
            file_path: Path to the pyproject.toml file
            package_info: PackageInfo object to update
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = toml.load(f)

            # Extract from [project] section (PEP 621)
            project = data.get("project")
            if isinstance(project, dict):
                if not package_info.name and "name" in project:
                    package_info.name = project["name"]

                if not package_info.version and "version" in project:
                    package_info.version = project["version"]

                if not package_info.summary and "description" in project:
                    package_info.summary = project["description"]

                for author in project.get("authors", []):
                    if isinstance(author, dict) and "name" in author:
                        package_info.authors.append(author["name"])
                    elif isinstance(author, str):
                        package_info.authors.append(author)

                if "license" in project:
                    license_value = project["license"]
                    if isinstance(license_value, dict):
                        # Only the "text" key names the license; a table with
                        # just "file" points at a file and carries no name
                        license_value = license_value.get("text")
                    self._add_license(package_info, license_value)

                dependencies = project.get("dependencies")
                if isinstance(dependencies, list):
                    package_info.runtime_dependencies.extend(dependencies)

            # Extract from [tool.poetry] section
            poetry = data.get("tool", {}).get("poetry")
            if isinstance(poetry, dict):
                if not package_info.name and "name" in poetry:
                    package_info.name = poetry["name"]

                if not package_info.version and "version" in poetry:
                    package_info.version = poetry["version"]

                if not package_info.summary and "description" in poetry:
                    package_info.summary = poetry["description"]

                authors = poetry.get("authors")
                if isinstance(authors, list):
                    package_info.authors.extend(authors)

                if "license" in poetry:
                    self._add_license(package_info, poetry["license"])

                # The "python" key is the interpreter constraint, not a package
                package_info.runtime_dependencies.extend(
                    name
                    for name in self._dependency_names(poetry.get("dependencies"))
                    if name.lower() != "python"
                )
                package_info.build_dependencies.extend(
                    self._dependency_names(poetry.get("dev-dependencies"))
                )

        except Exception as e:
            logger.warning(f"Failed to extract metadata from {file_path}: {e}")

    def _extract_from_cargo_toml(self, file_path: Path, package_info: "PackageInfo") -> None:
        """Extract metadata from Cargo.toml file.

        Args:
            file_path: Path to the Cargo.toml file
            package_info: PackageInfo object to update
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = toml.load(f)

            # Extract from [package] section.  A field inherited from the
            # workspace is a table ({workspace = true}) rather than a value,
            # so only literal strings / lists are accepted.
            package = data.get("package")
            if isinstance(package, dict):
                if not package_info.name and isinstance(package.get("name"), str):
                    package_info.name = package["name"]

                if not package_info.version and isinstance(package.get("version"), str):
                    package_info.version = package["version"]

                if not package_info.summary and isinstance(package.get("description"), str):
                    package_info.summary = package["description"]

                authors = package.get("authors")
                if isinstance(authors, list):
                    package_info.authors.extend(a for a in authors if isinstance(a, str))

                self._add_license(package_info, package.get("license"))

                # Like every other manifest, never overwrite a URL that an
                # earlier manifest already supplied
                repository = package.get("repository")
                if not package_info.url and isinstance(repository, str):
                    package_info.url = repository

            # Extract dependencies
            package_info.runtime_dependencies.extend(
                self._dependency_names(data.get("dependencies"))
            )
            for table_name in ("dev-dependencies", "build-dependencies"):
                package_info.build_dependencies.extend(
                    self._dependency_names(data.get(table_name))
                )

        except Exception as e:
            logger.warning(f"Failed to extract metadata from {file_path}: {e}")

    def _extract_from_composer_json(self, file_path: Path, package_info: "PackageInfo") -> None:
        """Extract metadata from composer.json file.

        Args:
            file_path: Path to the composer.json file
            package_info: PackageInfo object to update
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, dict):
                raise ValueError("top-level JSON value is not an object")

            # Extract basic information
            if not package_info.name and "name" in data:
                package_info.name = data["name"]

            if not package_info.version and "version" in data:
                package_info.version = data["version"]

            if not package_info.summary and "description" in data:
                package_info.summary = data["description"]

            if not package_info.url and "homepage" in data:
                package_info.url = data["homepage"]

            # Extract author information
            for author in data.get("authors", []):
                if isinstance(author, dict) and "name" in author:
                    package_info.authors.append(author["name"])

            # Extract license information (a single name or a list of names)
            license_data = data.get("license")
            license_names = license_data if isinstance(license_data, list) else [license_data]
            for license_name in license_names:
                self._add_license(package_info, license_name)

            # Extract dependencies
            package_info.runtime_dependencies.extend(
                self._dependency_names(data.get("require"))
            )
            package_info.build_dependencies.extend(
                self._dependency_names(data.get("require-dev"))
            )

        except Exception as e:
            logger.warning(f"Failed to extract metadata from {file_path}: {e}")

    def _extract_from_pom_xml(self, file_path: Path, package_info: "PackageInfo") -> None:
        """Extract metadata from pom.xml file.

        Only direct children of the root element are read, so the
        ``<parent>`` block's artifactId/version are not mistaken for the
        project's own.  The parent's version is used only when the project
        declares none.

        Args:
            file_path: Path to the pom.xml file
            package_info: PackageInfo object to update
        """
        # Imported locally so this method adds no new module-level dependency.
        # NOTE(review): ElementTree is not hardened against maliciously
        # constructed XML; confirm that is acceptable for the inputs used.
        import xml.etree.ElementTree as ET

        try:
            root = ET.parse(str(file_path)).getroot()

            # Tags are namespace-qualified ("{uri}project") when the POM
            # declares xmlns; reuse the root's prefix for every lookup
            namespace = root.tag[:root.tag.index('}') + 1] if root.tag.startswith('{') else ''

            def child_text(element, tag):
                """Whitespace-normalised text of a direct child, or ''."""
                if element is None:
                    return ""
                text = element.findtext(f"{namespace}{tag}")
                return " ".join(text.split()) if text else ""

            parent = root.find(f"{namespace}parent")

            if not package_info.name:
                name = child_text(root, "artifactId")
                if name:
                    package_info.name = name

            if not package_info.version:
                version = child_text(root, "version") or child_text(parent, "version")
                if version:
                    package_info.version = version

            if not package_info.summary:
                description = child_text(root, "description")
                if description:
                    package_info.summary = description

            if not package_info.url:
                url = child_text(root, "url")
                if url:
                    package_info.url = url

        except Exception as e:
            logger.warning(f"Failed to extract metadata from {file_path}: {e}")

    def _extract_from_build_gradle(self, file_path: Path, package_info: "PackageInfo") -> None:
        """Extract metadata from build.gradle file.

        This is a regex-based, best-effort reader of ``key = 'value'``
        assignments; it does not evaluate the build script.

        Args:
            file_path: Path to the build.gradle file
            package_info: PackageInfo object to update
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            def assigned(key):
                """Quoted value assigned to *key*, or None.

                The leading word boundary keeps e.g. ``kotlin_version = ...``
                from being read as ``version = ...``.
                """
                match = re.search(r'\b' + key + r'\s*=\s*["\']([^"\']+)["\']', content)
                return match.group(1) if match else None

            if not package_info.name:
                name = assigned('archivesBaseName')
                if name:
                    package_info.name = name

            if not package_info.version:
                version = assigned('version')
                if version:
                    package_info.version = version

            if not package_info.summary:
                description = assigned('description')
                if description:
                    package_info.summary = description

        except Exception as e:
            logger.warning(f"Failed to extract metadata from {file_path}: {e}")

    def _extract_from_readme(self, source_dir: Path, package_info: "PackageInfo") -> None:
        """Extract information from README files.

        The first README that can be read supplies the description (first
        paragraph that is not a '#' heading) and, if no URL is known yet, the
        first project-looking URL.

        Args:
            source_dir: Path to the source directory
            package_info: PackageInfo object to update
        """
        readme_files = [
            "README", "README.md", "README.txt", "README.rst",
            "readme", "readme.md", "readme.txt", "readme.rst"
        ]

        # A URL ends at whitespace or at the brackets/quotes that wrap links
        # in Markdown, reST and HTML.
        url_chars = r'[^\s()\[\]<>"\'`]+'
        path_part = r'[^\s/()\[\]<>"\'`]+'
        # (pattern, needs_scheme): the repository-host patterns match without
        # a scheme, so "https://" is prepended to what they capture
        url_patterns = [
            (r'github\.com/' + path_part + '/' + path_part, True),
            (r'gitlab\.com/' + path_part + '/' + path_part, True),
            (r'https?://' + url_chars, False),
        ]

        for readme_name in readme_files:
            readme_path = source_dir / readme_name
            if not readme_path.exists():
                continue

            try:
                with open(readme_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Extract the first paragraph as description if not already set
                if not package_info.description:
                    description_lines = []
                    for line in content.split('\n'):
                        line = line.strip()
                        if line and not line.startswith('#'):
                            description_lines.append(line)
                        elif description_lines:
                            # Blank line or heading after text: paragraph ended
                            break

                    if description_lines:
                        package_info.description = ' '.join(description_lines)

                # Extract project URL if not already set
                if not package_info.url:
                    for pattern, needs_scheme in url_patterns:
                        match = re.search(pattern, content)
                        if not match:
                            continue
                        # Drop sentence punctuation that directly follows the URL
                        found = match.group(0).rstrip('.,;:!?')
                        package_info.url = f"https://{found}" if needs_scheme else found
                        break

                break  # Use the first README file found

            except Exception as e:
                logger.warning(f"Failed to extract information from {readme_path}: {e}")

    def _extract_from_license_files(self, source_dir: Path, package_info: "PackageInfo") -> None:
        """Extract information from license files.

        The first license file that can be read is classified by keyword; a
        recognised license is added unless one with that name is already
        present.  The new entry's ``text`` is the license file name.

        Args:
            source_dir: Path to the source directory
            package_info: PackageInfo object to update
        """
        license_files = [
            "LICENSE", "LICENSE.txt", "LICENSE.md",
            "COPYING", "COPYING.txt",
            "license", "license.txt", "license.md",
            "copying", "copying.txt"
        ]

        for license_name in license_files:
            license_path = source_dir / license_name
            if not license_path.exists():
                continue

            try:
                with open(license_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Try to identify the license type
                license_type = self._identify_license_type(content)
                if license_type:
                    self._add_license(package_info, license_type, text=license_name)

                break  # Use the first license file found

            except Exception as e:
                logger.warning(f"Failed to extract information from {license_path}: {e}")

    def _extract_from_git_info(self, source_dir: Path, package_info: "PackageInfo") -> None:
        """Extract information from git repository.

        Reads the first ``url = ...`` entry of ``.git/config`` as the project
        URL when none is known yet.

        Args:
            source_dir: Path to the source directory
            package_info: PackageInfo object to update
        """
        git_dir = source_dir / '.git'
        if not git_dir.exists():
            return

        try:
            # Try to get remote URL
            git_config = git_dir / 'config'
            if git_config.exists():
                with open(git_config, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Anchor on the key so "pushurl = ..." is not mistaken for "url = ..."
                url_match = re.search(r'^[ \t]*url[ \t]*=[ \t]*(.+)$', content, re.MULTILINE)
                if url_match and not package_info.url:
                    url = url_match.group(1).strip()
                    # Convert git@host:owner/repo to https://host/owner/repo
                    if url.startswith('git@'):
                        url = url.replace(':', '/', 1).replace('git@', 'https://', 1)
                    # Never copy credentials embedded in an HTTP(S) remote
                    url = re.sub(r'^(https?://)[^/@\s]+@', r'\1', url)
                    package_info.url = url

        except Exception as e:
            logger.warning(f"Failed to extract git information: {e}")

    def _extract_ast_value(self, node) -> Optional[Any]:
        """Extract a literal value from an AST node.

        Args:
            node: AST node

        Returns:
            The string for a string literal, a list of the non-empty
            extracted items for a list or tuple literal, otherwise None
        """
        if isinstance(node, ast.Constant):
            return node.value if isinstance(node.value, str) else None
        # Older interpreters parse string literals as ast.Str.  The class is
        # matched by name because touching ast.Str is deprecated on newer ones.
        if type(node).__name__ == "Str":
            return node.s
        if isinstance(node, (ast.List, ast.Tuple)):
            items = []
            for item in node.elts:
                value = self._extract_ast_value(item)
                if value:
                    items.append(value)
            return items
        return None

    def _identify_license_type(self, content: str) -> Optional[str]:
        """Identify the license type from license file content.

        Simple keyword-based detection.  Checks are ordered so that a license
        which merely quotes or mentions another one is not reported as that
        other license.

        Args:
            content: Content of the license file

        Returns:
            License type string or None
        """
        # Lower-case and collapse whitespace so phrases that are wrapped
        # across lines in the file still match
        text = " ".join(content.lower().split())

        # The Boost grant starts with the same words as MIT: test it first
        if "boost software license" in text:
            return "BSL-1.0"
        if re.search(r"\bmit license\b", text) or "permission is hereby granted" in text:
            return "MIT"
        if "apache license" in text or "apache-2.0" in text:
            return "Apache-2.0"
        # MPL text names the GNU licenses as secondary licenses: test it
        # before the GNU family
        if "mozilla public license" in text:
            return "MPL-2.0"

        # GNU licenses reference each other, so the family whose name appears
        # first (normally the title) wins
        gnu_markers = (
            ("AGPL", r"gnu affero general public license|\bagpl"),
            ("LGPL", r"gnu (?:lesser|library) general public license|\blgpl"),
            ("GPL", r"gnu general public license|\bgpl"),
        )
        hits = []
        for family, pattern in gnu_markers:
            match = re.search(pattern, text)
            if match:
                hits.append((match.start(), family))
        if hits:
            family = min(hits)[1]
            if family == "AGPL":
                return "AGPL-3.0"
            if "version 3" in text:
                return f"{family}-3.0"
            if "version 2" in text:
                return "LGPL-2.1" if family == "LGPL" else "GPL-2.0"
            return family

        bsd_grant = "redistribution and use in source and binary forms" in text
        if "bsd license" in text or bsd_grant:
            if "3-clause" in text:
                return "BSD-3-Clause"
            if "2-clause" in text:
                return "BSD-2-Clause"
            if "all advertising materials" in text:
                return "BSD-4-Clause"
            if "neither the name" in text:
                return "BSD-3-Clause"
            return "BSD-2-Clause" if bsd_grant else "BSD"

        if re.search(r"\bisc license\b", text) or re.search(
            r"permission to use, copy, modify,? and(?:/or)? distribute this software "
            r"for any purpose with or without fee",
            text,
        ):
            return "ISC"
        # Whole word only: "implied" contains "mpl"
        if re.search(r"\bmpl\b", text):
            return "MPL-2.0"
        # Whole word only: "unlicensed" is not the Unlicense
        if re.search(r"\bunlicense\b", text) or "free and unencumbered software" in text:
            return "Unlicense"
        # Only CC0 is reported; other Creative Commons licenses are left
        # unidentified rather than mislabelled as CC0
        if "cc0" in text or "creative commons zero" in text:
            return "CC0-1.0"

        return None

    def _map_license_category(self, license_name: str) -> "LicenseCategory":
        """Map a license name to a license category.

        Args:
            license_name: Name of the license

        Returns:
            LicenseCategory enum value
        """
        # Direct mapping
        if license_name in self.license_mapping:
            return self.license_mapping[license_name]

        license_lower = license_name.lower()

        # Same lookup, ignoring case and surrounding whitespace
        stripped = license_lower.strip()
        for known_name, category in self.license_mapping.items():
            if known_name.lower() == stripped:
                return category

        # Keyword fallback.  Short identifiers are matched as whole words so
        # that e.g. "limited" is not read as "mit".
        libre_keywords = (
            r"\bmit\b|apache|bsd|gpl|\bisc\b|\bmpl\b|mozilla public|"
            r"\bepl\b|eclipse public|\bunlicense\b|\bcc0\b|boost software"
        )
        if re.search(libre_keywords, license_lower):
            return LicenseCategory.LIBRE
        # "unlicensed" is how npm marks a package that grants no license
        if re.search(r"proprietary|commercial|\bunlicensed\b", license_lower):
            return LicenseCategory.PROPRIETARY
        return LicenseCategory.SOURCE_AVAILABLE
|