瀏覽代碼

Bug fixes

clanker 1 月之前
父節點
當前提交
0531f4b905
共有 3 個文件被更改,包括 195 次插入和 3 次删除
  1. 1 1
      src/autusm/cli.py
  2. 18 0
      src/autusm/interaction.py
  3. 176 2
      src/autusm/metadata.py

+ 1 - 1
src/autusm/cli.py

@@ -156,7 +156,7 @@ def main(
 
         # Step 4: Extract metadata
         logger.info("Extracting metadata...")
-        package_info = metadata_extractor.extract(source_dir)
+        package_info = metadata_extractor.extract(source_dir, url)
 
         # Override with command line arguments if provided
         if name:

+ 18 - 0
src/autusm/interaction.py

@@ -47,6 +47,24 @@ class UserInteraction:
         
         logger.info("Filling in missing package information")
         
+        # Check if we have filename-based detection with low confidence
+        if "filename_confidence" in package_info.extra_data:
+            confidence = package_info.extra_data["filename_confidence"]
+            source = package_info.extra_data["filename_source"]
+            
+            if confidence < 0.8:
+                print(f"\nNote: Package name and version were extracted from filename '{source}'")
+                print(f"Confidence: {confidence:.0%} (lower confidence may indicate inaccurate detection)")
+                
+                if not self.confirm_action("Use these extracted values?"):
+                    # User wants to override
+                    if package_info.name:
+                        package_info.name = self._ask_string("Package name", required=True, default=package_info.name)
+                        package_info.name = self._sanitize_package_name(package_info.name)
+                    if package_info.version:
+                        package_info.version = self._ask_string("Package version", required=True, default=package_info.version)
+                        package_info.version = self._sanitize_version(package_info.version)
+        
         # Fill in basic information
         if not package_info.name:
             package_info.name = self._ask_string("Package name", required=True)

+ 176 - 2
src/autusm/metadata.py

@@ -11,6 +11,7 @@ import logging
 import re
 import ast
 import toml
+import urllib.parse
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Tuple
 
@@ -84,7 +85,7 @@ class MetadataExtractor:
             "Commercial": LicenseCategory.PROPRIETARY
         }
 
-    def extract(self, source_dir: Path) -> PackageInfo:
+    def extract(self, source_dir: Path, url: Optional[str] = None) -> PackageInfo:
         """Extract metadata from a source directory.
         
         Args:
@@ -129,6 +130,10 @@ class MetadataExtractor:
             self._extract_from_license_files(source_dir, package_info)
             self._extract_from_git_info(source_dir, package_info)
             
+            # If we still don't have a name or version, try to derive from URL/filename
+            if url and (not package_info.name or not package_info.version):
+                self._extract_from_filename(url, package_info)
+            
             # If we still don't have a name, try to derive from directory
             if not package_info.name:
                 package_info.name = source_dir.name.lower().replace('-', '_').replace(' ', '_')
@@ -761,4 +766,173 @@ class MetadataExtractor:
         elif "proprietary" in license_lower or "commercial" in license_lower:
             return LicenseCategory.PROPRIETARY
         else:
-            return LicenseCategory.SOURCE_AVAILABLE
+            return LicenseCategory.SOURCE_AVAILABLE
+
+    def _extract_from_filename(self, url: str, package_info: PackageInfo) -> None:
+        """Fill in a missing package name and/or version from the archive filename.
+
+        The last path segment of ``url`` is percent-decoded, stripped of its
+        archive extension and parsed for a name and a version. Fields that are
+        already set on ``package_info`` are never overwritten. Only when at
+        least one field was actually taken from the filename are
+        ``extra_data["filename_confidence"]`` and
+        ``extra_data["filename_source"]`` recorded, for potential user
+        confirmation.
+
+        This is a best-effort fallback: any exception is logged as a warning
+        and not propagated.
+
+        Args:
+            url: URL to the source archive
+            package_info: PackageInfo object to update in place
+        """
+        try:
+            parsed_url = urllib.parse.urlparse(url)
+            # URL paths always use "/" as separator, whatever the host OS is.
+            # Split first and decode afterwards so an encoded "%2F" cannot be
+            # mistaken for a path separator.
+            last_segment = parsed_url.path.rsplit('/', 1)[-1]
+            filename = urllib.parse.unquote(last_segment)
+
+            if not filename:
+                logger.debug("Could not extract filename from URL")
+                return
+
+            logger.debug(f"Extracting metadata from filename: {filename}")
+
+            # Remove archive extensions, then parse common naming patterns
+            base_name = self._strip_archive_extensions(filename)
+            name, version, confidence = self._parse_filename_pattern(base_name)
+
+            # Only use values for fields that are still missing
+            used_filename = False
+
+            if name and not package_info.name:
+                package_info.name = name
+                used_filename = True
+                logger.debug(f"Extracted name from filename: {name} (confidence: {confidence})")
+
+            if version and not package_info.version:
+                package_info.version = version
+                used_filename = True
+                logger.debug(f"Extracted version from filename: {version} (confidence: {confidence})")
+
+            # Record the confidence only when the filename contributed a value;
+            # otherwise a later confirmation step would ask the user about
+            # values that were never extracted.
+            if used_filename:
+                package_info.extra_data["filename_confidence"] = confidence
+                package_info.extra_data["filename_source"] = filename
+
+        except Exception as e:
+            logger.warning(f"Failed to extract metadata from filename: {e}")
+
+    def _strip_archive_extensions(self, filename: str) -> str:
+        """Remove one trailing archive extension from a filename.
+
+        Matching is case-insensitive and the result is always lower-cased,
+        e.g. ``"Foo-1.0.tar.Z"`` becomes ``"foo-1.0"``.
+
+        Args:
+            filename: Original filename with extensions
+
+        Returns:
+            Lower-cased filename without its archive extension, or the
+            lower-cased filename unchanged when no known extension matches
+        """
+        # Every entry is lower case because it is compared against the
+        # lower-cased filename; an upper-case '.tar.Z' entry could never match.
+        extensions = [
+            '.tar.gz', '.tar.bz2', '.tar.xz', '.tar.z',
+            '.tgz', '.tbz2', '.txz',
+            '.tar', '.zip', '.rar', '.7z'
+        ]
+
+        base_name = filename.lower()
+        # Longest extensions first so a compound suffix always wins
+        for ext in sorted(extensions, key=len, reverse=True):
+            if base_name.endswith(ext):
+                return base_name[:-len(ext)]
+
+        return base_name
+
+    def _parse_filename_pattern(self, base_name: str) -> Tuple[Optional[str], Optional[str], float]:
+        """Parse package name and version from base filename.
+
+        The patterns are tried in order and the first one whose name and
+        version both pass validation wins. If none matches, the whole base
+        name is cleaned up and offered as a name-only guess.
+
+        Args:
+            base_name: Filename without archive extensions
+
+        Returns:
+            Tuple of (name, version, confidence). ``version`` is None for a
+            name-only guess (confidence 0.4); all of ``(None, None, 0.0)`` is
+            returned when nothing usable was found.
+        """
+        # Common patterns to try, in order of preference.
+        # The "v"-prefixed forms must come before the generic "package.version"
+        # form: otherwise "foo-v1.0" is split at its last dot and parsed as
+        # name "foo-v1", version "0".
+        patterns = [
+            # package-version (most common)
+            (r'^([a-zA-Z0-9._-]+)-([0-9][a-zA-Z0-9._-]*)$', 0.9),
+            # package_version
+            (r'^([a-zA-Z0-9._-]+)_([0-9][a-zA-Z0-9._-]*)$', 0.8),
+            # package.v.version
+            (r'^([a-zA-Z0-9._-]+)\.v\.([0-9][a-zA-Z0-9._-]*)$', 0.8),
+            # package-vversion
+            (r'^([a-zA-Z0-9._-]+)-v([0-9][a-zA-Z0-9._-]*)$', 0.8),
+            # package_vversion
+            (r'^([a-zA-Z0-9._-]+)_v([0-9][a-zA-Z0-9._-]*)$', 0.7),
+            # package.version
+            (r'^([a-zA-Z0-9._-]+)\.([0-9][a-zA-Z0-9._-]*)$', 0.7),
+            # NOTE(review): any string matching the two patterns below also
+            # matches the first pattern's regex (its version class includes
+            # '-'), so they look unreachable in practice - confirm.
+            # package-version-suffix
+            (r'^([a-zA-Z0-9._-]+)-([0-9][a-zA-Z0-9._-]*)-[a-zA-Z0-9._-]+$', 0.6),
+            # package-version-release
+            (r'^([a-zA-Z0-9._-]+)-([0-9][a-zA-Z0-9._-]*)-([0-9]+)$', 0.6),
+        ]
+
+        for pattern, confidence in patterns:
+            match = re.match(pattern, base_name)
+            if not match:
+                continue
+
+            # The groups can only contain [a-zA-Z0-9._-], so trimming stray
+            # separators from both ends is all the clean-up they need.
+            name = match.group(1).strip('._-')
+            version = match.group(2).strip('._-')
+
+            # Debug logging
+            logger.debug(f"Pattern matched: {pattern} with confidence {confidence}")
+            logger.debug(f"Raw name: {match.group(1)}, Raw version: {match.group(2)}")
+            logger.debug(f"Cleaned name: {name}, Cleaned version: {version}")
+
+            # Additional validation; on failure fall through to the next pattern
+            if self._is_valid_name(name) and self._is_valid_version(version):
+                return name, version, confidence
+
+        # If no pattern matches, try to extract just the name (no version).
+        # Whitespace is turned into underscores BEFORE disallowed characters
+        # are dropped; the other order removes the spaces first and fuses
+        # separate words together.
+        name = re.sub(r'\s+', '_', base_name)
+        name = re.sub(r'[^a-zA-Z0-9._-]', '', name)
+        # Validate the cleaned name, since clean-up may have shortened it
+        if self._is_valid_name(name):
+            return name, None, 0.4
+
+        return None, None, 0.0
+
+    def _is_valid_name(self, name: str) -> bool:
+        """Tell whether a string is plausible as a package name.
+
+        A plausible name is at least two characters long, contains at least
+        one ASCII letter and does not consist solely of digits and dots.
+
+        Args:
+            name: String to validate
+
+        Returns:
+            True if looks like a valid package name
+        """
+        # Reject empty/None and single-character candidates up front
+        long_enough = bool(name) and len(name) >= 2
+        if not long_enough:
+            return False
+
+        has_letter = re.search(r'[a-zA-Z]', name) is not None
+        # Pure "digits and dots" strings look like versions, not names
+        only_digits_and_dots = re.match(r'^[0-9.]+$', name) is not None
+
+        return has_letter and not only_digits_and_dots
+
+    def _is_valid_version(self, version: str) -> bool:
+        """Tell whether a string is plausible as a version.
+
+        A plausible version is non-empty, begins with a character for which
+        ``str.isdigit()`` is true, and contains at least one ASCII digit.
+
+        Args:
+            version: String to validate
+
+        Returns:
+            True if looks like a valid version
+        """
+        # Empty or None can never be a version
+        if not version:
+            return False
+
+        leads_with_digit = version[0].isdigit()
+        # isdigit() also accepts non-ASCII digits, hence the separate ASCII check
+        has_ascii_digit = re.search(r'[0-9]', version) is not None
+
+        return leads_with_digit and has_ascii_digit