|
|
@@ -11,6 +11,7 @@ import logging
|
|
|
import re
|
|
|
import ast
|
|
|
import toml
|
|
|
+import urllib.parse
|
|
|
from pathlib import Path
|
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
|
|
|
|
@@ -84,7 +85,7 @@ class MetadataExtractor:
|
|
|
"Commercial": LicenseCategory.PROPRIETARY
|
|
|
}
|
|
|
|
|
|
- def extract(self, source_dir: Path) -> PackageInfo:
|
|
|
+ def extract(self, source_dir: Path, url: Optional[str] = None) -> PackageInfo:
|
|
|
"""Extract metadata from a source directory.
|
|
|
|
|
|
Args:
|
|
|
@@ -129,6 +130,10 @@ class MetadataExtractor:
|
|
|
self._extract_from_license_files(source_dir, package_info)
|
|
|
self._extract_from_git_info(source_dir, package_info)
|
|
|
|
|
|
+ # If we still don't have a name or version, try to derive from URL/filename
|
|
|
+ if url and (not package_info.name or not package_info.version):
|
|
|
+ self._extract_from_filename(url, package_info)
|
|
|
+
|
|
|
# If we still don't have a name, try to derive from directory
|
|
|
if not package_info.name:
|
|
|
package_info.name = source_dir.name.lower().replace('-', '_').replace(' ', '_')
|
|
|
@@ -761,4 +766,173 @@ class MetadataExtractor:
|
|
|
elif "proprietary" in license_lower or "commercial" in license_lower:
|
|
|
return LicenseCategory.PROPRIETARY
|
|
|
else:
|
|
|
- return LicenseCategory.SOURCE_AVAILABLE
|
|
|
+ return LicenseCategory.SOURCE_AVAILABLE
|
|
|
+
|
|
|
def _extract_from_filename(self, url: str, package_info: PackageInfo) -> None:
    """Extract package name and version from archive filename as fallback.

    The last segment of the URL path is percent-decoded, stripped of its
    archive extension and parsed for a name/version pair. Fields that are
    already set on ``package_info`` are never overwritten. The parse
    confidence and the filename it was computed from are always recorded in
    ``package_info.extra_data`` under ``filename_confidence`` and
    ``filename_source``.

    This is a best-effort step: any exception is logged as a warning and
    swallowed so that it never aborts the surrounding extraction.

    Args:
        url: URL to the source archive
        package_info: PackageInfo object to update
    """
    try:
        # urlparse separates the query string and fragment from the path.
        # URL paths always use "/" regardless of the host platform, so split
        # on that explicitly, and decode percent-escapes (e.g. "%2B", "%20")
        # first so they do not leak into the derived name or version.
        parsed_url = urllib.parse.urlparse(url)
        decoded_path = urllib.parse.unquote(parsed_url.path)
        filename = decoded_path.rsplit('/', 1)[-1]

        if not filename:
            # Happens for URLs without a path or with a trailing slash.
            logger.debug("Could not extract filename from URL")
            return

        logger.debug(f"Extracting metadata from filename: {filename}")

        # Remove archive extensions
        base_name = self._strip_archive_extensions(filename)

        # Parse common patterns
        name, version, confidence = self._parse_filename_pattern(base_name)

        # Only use if we haven't already found name/version
        if name and not package_info.name:
            package_info.name = name
            logger.debug(f"Extracted name from filename: {name} (confidence: {confidence})")

        if version and not package_info.version:
            package_info.version = version
            logger.debug(f"Extracted version from filename: {version} (confidence: {confidence})")

        # Always store confidence level for potential user confirmation
        package_info.extra_data["filename_confidence"] = confidence
        package_info.extra_data["filename_source"] = filename

    except Exception as e:
        # Deliberately broad: filename parsing is only a fallback heuristic.
        logger.warning(f"Failed to extract metadata from filename: {e}")
|
|
|
+
|
|
|
+ def _strip_archive_extensions(self, filename: str) -> str:
|
|
|
+ """Remove archive extensions from filename.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ filename: Original filename with extensions
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ Filename without archive extensions
|
|
|
+ """
|
|
|
+ # Common archive extensions to strip (longer ones first)
|
|
|
+ extensions = [
|
|
|
+ '.tar.gz', '.tar.bz2', '.tar.xz', '.tar.Z',
|
|
|
+ '.tgz', '.tbz2', '.txz',
|
|
|
+ '.tar', '.zip', '.rar', '.7z'
|
|
|
+ ]
|
|
|
+
|
|
|
+ base_name = filename.lower()
|
|
|
+ for ext in sorted(extensions, key=len, reverse=True):
|
|
|
+ if base_name.endswith(ext):
|
|
|
+ base_name = base_name[:-len(ext)]
|
|
|
+ break
|
|
|
+
|
|
|
+ return base_name
|
|
|
+
|
|
|
def _parse_filename_pattern(self, base_name: str) -> Tuple[Optional[str], Optional[str], float]:
    """Parse package name and version from base filename.

    The string is split at a separator that is immediately followed by a
    digit: the text before the separator is the name, the text after it is
    the version. Separator styles are tried in order of preference and,
    within one style, the leftmost split that yields a valid name and
    version wins. Any release or build suffix therefore stays with the
    version, for example:

        ``requests-2.31.0``  -> ``requests`` / ``2.31.0``
        ``bash-5.2.15-1``    -> ``bash`` / ``5.2.15-1``
        ``node-v18.16.0``    -> ``node`` / ``18.16.0``
        ``boost_1_82_0``     -> ``boost`` / ``1_82_0``

    Args:
        base_name: Filename without archive extensions

    Returns:
        Tuple of (name, version, confidence). ``version`` is None with
        confidence 0.4 when only a name could be derived; name and version
        are both None with confidence 0.0 when nothing usable was found.
    """
    # Separator styles to try, in order of preference, with the confidence
    # assigned to a split on that separator. The "v"-prefixed styles must
    # come before the bare "." style: otherwise "node-v18.16.0" would be
    # split at its last dot into "node-v18.16" / "0".
    separators = [
        (r'-', 0.9),       # package-version (most common)
        (r'_', 0.8),       # package_version
        (r'\.v\.', 0.8),   # package.v.version
        (r'-v', 0.8),      # package-vversion
        (r'_v', 0.7),      # package_vversion
        (r'\.', 0.7),      # package.version
    ]

    # Only strings made up entirely of name/version characters are split;
    # anything else goes straight to the name-only fallback below.
    if re.fullmatch(r'[a-zA-Z0-9._-]+', base_name):
        for pattern, confidence in separators:
            # The lookahead keeps the leading digit as part of the version.
            # Candidates are visited left to right so the version keeps any
            # trailing "-release" / "-suffix" part instead of the name
            # absorbing the real version number.
            for match in re.finditer(pattern + r'(?=[0-9])', base_name):
                raw_name = base_name[:match.start()]
                raw_version = base_name[match.end():]
                if not raw_name:
                    continue

                logger.debug(f"Pattern matched: {pattern} with confidence {confidence}")
                logger.debug(f"Raw name: {raw_name}, Raw version: {raw_version}")

                # Drop dangling separators left at either end of each part.
                name = raw_name.strip('._-')
                version = raw_version.strip('._-')

                logger.debug(f"Cleaned name: {name}, Cleaned version: {version}")

                # Additional validation; an invalid candidate simply moves
                # on to the next split position / separator style.
                if self._is_valid_name(name) and self._is_valid_version(version):
                    return name, version, confidence

    # If no pattern matches, try to extract just the name (no version)
    if self._is_valid_name(base_name):
        # Turn whitespace into underscores *before* removing disallowed
        # characters; doing it afterwards would find no whitespace left.
        name = re.sub(r'\s+', '_', base_name.strip())
        name = re.sub(r'[^a-zA-Z0-9._-]', '', name)
        # Cleaning may have removed most of the string, so re-check it.
        if self._is_valid_name(name):
            return name, None, 0.4

    return None, None, 0.0
|
|
|
+
|
|
|
+ def _is_valid_name(self, name: str) -> bool:
|
|
|
+ """Check if a string looks like a valid package name.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ name: String to validate
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ True if looks like a valid package name
|
|
|
+ """
|
|
|
+ if not name or len(name) < 2:
|
|
|
+ return False
|
|
|
+
|
|
|
+ # Should contain at least one letter
|
|
|
+ if not re.search(r'[a-zA-Z]', name):
|
|
|
+ return False
|
|
|
+
|
|
|
+ # Should not be just numbers and dots (likely not a package name)
|
|
|
+ if re.match(r'^[0-9.]+$', name):
|
|
|
+ return False
|
|
|
+
|
|
|
+ return True
|
|
|
+
|
|
|
+ def _is_valid_version(self, version: str) -> bool:
|
|
|
+ """Check if a string looks like a valid version.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ version: String to validate
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ True if looks like a valid version
|
|
|
+ """
|
|
|
+ if not version or len(version) < 1:
|
|
|
+ return False
|
|
|
+
|
|
|
+ # Should start with a digit
|
|
|
+ if not version[0].isdigit():
|
|
|
+ return False
|
|
|
+
|
|
|
+ # Should contain at least one digit
|
|
|
+ if not re.search(r'[0-9]', version):
|
|
|
+ return False
|
|
|
+
|
|
|
+ return True
|