2021-06-24 07:36:28 +02:00
import re
import os
import argparse
import subprocess
from datetime import datetime
import requests
2021-06-24 11:35:02 +02:00
from utils import load_packages_from_requirements , get_mapping_files_from_pipreqs , user_response_multi_choices
from utils import get_date_last_modified_python_file , get_python_filename_at_root
2021-06-24 08:33:54 +02:00
2021-06-24 07:36:28 +02:00
# TODO : Propose choice between date of first import or Added in requirements
2021-06-24 11:35:02 +02:00
# TODO : Other choices : When project was created, last commit (That wasn't on md file) get_date_last_modified_python_file()
2021-06-24 07:36:28 +02:00
# TODO : Pin also the dependencies tree of the packages Ex : Torch package might install numpy, etc
# TODO : Poetry mode ?
2021-06-24 08:33:54 +02:00
# TODO : Add a mode where file/folder creation/last update is used if no git repo ?
2021-06-24 07:36:28 +02:00
2021-06-24 11:13:01 +02:00
# TODO : Add jupyter notebook support
# TODO : Hide logging argument
# TODO : Add switch to keep unused imports
# FIXME : Some unused imports might be important (Pillow for example)
2021-06-24 08:32:38 +02:00
# Captures the numeric timestamp of a "Date: <digits>" header line
# (the format `git log --date unix` is asked to produce further down).
EXTRACT_DATE_REGEX = re.compile(r'Date:\s*(\d+)')

# Any ASCII letter. Applied to version strings to spot non-final releases
# such as "1.0rc1" or "2.0b3".
LETTER_REGEX = re.compile(r'[a-zA-Z]')

parser = argparse.ArgumentParser(prog="Python Requirements Version Guesser")
# TODO : CHDIR in this directory if provided
parser.add_argument(
    '--git_repo_path',
    type=str,
    default=None,
    required=False,
)
2021-06-24 07:36:28 +02:00
def get_pypi_history(package_name, ignore_release_candidat=True):
    """
    Retrieve version release dates via Pypi JSON api

    Parameters:
        package_name: name of the package to look up on Pypi.
        ignore_release_candidat: when True, skip every version whose
            identifier contains a letter (e.g. "1.0rc1", "2.0b3").

    Returns:
        A list of (version, release_date) tuples sorted from the most recent
        to the oldest release, or None when Pypi does not answer with HTTP 200.
        The list can be empty when every release was filtered out.
    """
    # Always bound the request: without a timeout requests may block forever
    resp = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=30)

    if resp.status_code != 200:
        print(f"[INFO] Couldn't find package '{package_name}' on Pypi. Ignoring")
        return None

    releases = resp.json()['releases']

    versions = []
    for version, release_info_per_os in releases.items():
        # Just taking the first platform upload date for now..
        # Is it really different for other platforms ? Need to validate
        # TODO : Give appropriate version based on os and python Versions resp['info']['requires_dist'] # ['require_python']
        if not release_info_per_os:
            # Version without any uploaded file, nothing to date it with
            continue

        if ignore_release_candidat and LETTER_REGEX.search(version):
            continue

        release_info = release_info_per_os[0]
        # 'upload_time' looks like "2021-06-24T07:36:28", only the day is kept
        release_date = datetime.strptime(release_info['upload_time'].split("T")[0], '%Y-%m-%d')
        versions.append((version, release_date))

    # FIXME : Do we really need to sort ? Versions should already be sorted
    return sorted(versions, key=lambda x: x[1], reverse=True)
2021-06-24 07:36:28 +02:00
2021-06-24 08:32:38 +02:00
def find_version_at_date(available_versions, date):
    """
    Find the most recent version that was already released at a given date

    Parameters:
        available_versions: sequence of (version, release_date) tuples sorted
            from the most recent to the oldest release.
        date: datetime to compare the release dates against.

    Returns:
        The first version whose release date is not after `date`. When `date`
        is older than every release, the oldest available version is returned.
        None when `available_versions` is empty.
    """
    if not available_versions:
        # Nothing to choose from (previously crashed with an IndexError)
        return None

    # FIXME : Do binary search
    for candidate_version, candidate_date in available_versions:
        if date >= candidate_date:
            return candidate_version

    # Date is older than available versions... Fallback on the oldest available version
    return available_versions[-1][0]
2021-06-24 07:36:28 +02:00
2021-06-24 11:13:01 +02:00
def get_all_imports(stdlib_list=None):
    """
    List the top-level modules imported by the python files under the current directory

    Runs a recursive `grep` over every "*.py" file and keeps the word that
    follows a line-leading "import " or "from ".

    Parameters:
        stdlib_list: optional collection of module names to leave out of the result.

    Returns:
        The de-duplicated list of imported module names, as sorted by `sort | uniq`.

    Raises:
        Exception: when no import statement could be found (this includes
            the case where the grep command itself failed).
    """
    cmd = 'grep -PRoh --include="*.py" "(?<=^import )\\w*|(?<=^from )\\w*" . | sort | uniq'

    try:
        grep_out = subprocess.check_output(cmd, shell=True).decode().strip()
    except (subprocess.SubprocessError, OSError, UnicodeDecodeError):
        # A failing command is reported below as "no import found".
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        grep_out = ""

    if not grep_out:
        raise Exception("[ERROR] couldn't find any import statement")

    imports = [line.strip() for line in grep_out.split("\n")]

    if stdlib_list:
        return [name for name in imports if name not in stdlib_list]

    return imports
2021-06-24 21:16:33 +02:00
def get_date_when_package_committed(package_name, via_requirements=False, latest_addition=False):
    """
    Find in the git history when a package was added to the project

    Parameters:
        package_name: name to look for. It is matched against
            "import <name>" / "from <name>" lines, or against the end of a
            line of requirements.txt when `via_requirements` is True.
        via_requirements: search the history of requirements.txt instead of
            the import statements of the whole repository.
        latest_addition: return the most recent commit that added a matching
            line instead of the oldest one.

    Returns:
        A datetime built from the commit timestamp with datetime.fromtimestamp,
        or None when no commit adding a matching line was found. Nothing is
        printed here, reporting is left to the caller.

    Raises:
        Exception: when the filtered git-log output has an unexpected layout.
    """
    import shlex

    if not via_requirements:
        search_pattern = f"^import {package_name}|^from {package_name}"
        filename = ""
    else:
        search_pattern = f"{package_name}$"
        filename = "requirements.txt"

    # We grep for 'date' | '+ search pattern' so that we keep only commits that insert lines (+)
    # Both patterns are shell-quoted so that an unusual package name cannot break out of the command
    grep_pattern = f"^date:\\|\\+.*{package_name}"
    cmd = f"git log -G {shlex.quote(search_pattern)} --date unix -p {filename} | grep -i {shlex.quote(grep_pattern)}"

    try:
        blame_out = subprocess.check_output(cmd, shell=True).decode().strip()
    except (subprocess.SubprocessError, OSError, UnicodeDecodeError):
        # grep exits with a non-zero status when nothing matched
        blame_out = ""

    if not blame_out:
        return None

    # Remove commit that are not directly followed by '+ import' (We grepped for this in cmd)
    # This is ugly.. TODO: figure out a better way in the grep command
    dates = []
    got_plus = False
    # Walk the output backward: added lines are seen first, then the date header of their commit
    for line in reversed(blame_out.split('\n')):
        if line.startswith("+"):
            got_plus = True
        elif got_plus:
            got_plus = False
            matches = EXTRACT_DATE_REGEX.search(line)
            if matches:
                dates.append(datetime.fromtimestamp(int(matches.group(1))))
            else:
                raise Exception("[ERROR] while parsing git-log")

    if not dates:
        # Only date headers were found, no commit actually added a matching line
        return None

    # Get first date where the line was added (or the last one if requested)
    return max(dates) if latest_addition else min(dates)
2021-06-24 07:36:28 +02:00
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("Python requirements guesser")
    print(banner)

    print("\nFollow the steps to guess package versions based on when they were added to git\n")

    stdlib_list, from_import_to_package_mapping, from_package_to_import_mapping = get_mapping_files_from_pipreqs()

    # Python files at the root are added to the names excluded from the imports
    stdlib_list.update(get_python_filename_at_root())
    all_imported_packages = set(get_all_imports(stdlib_list))

    # Packages listed in requirements.txt, completed with the ones that are only imported
    all_packages = load_packages_from_requirements('requirements.txt')
    packages_in_requirements = set(all_packages.keys())
    for extra_package in all_imported_packages - packages_in_requirements:
        all_packages[extra_package] = None

    packages = []
    for package_name, version in all_packages.items():
        if version is not None:
            print(f"[INFO] Package '{package_name}' version is specified in requirements.txt ({version})")
            packages.append((package_name, version))
            continue

        # TODO : Add argument to select one of the options by default
        # The import name is resolved first, from the name as found, before the package name is remapped
        import_name = from_package_to_import_mapping[package_name] if package_name in from_package_to_import_mapping else package_name
        if package_name in from_import_to_package_mapping:
            package_name = from_import_to_package_mapping[package_name]

        available_versions = get_pypi_history(package_name, ignore_release_candidat=True)
        if available_versions is None:
            continue

        import_date = get_date_when_package_committed(import_name, via_requirements=False)
        if import_date is None:
            print(f"[INFO] Package '{package_name}' is defined in requirements.txt but not used (Or comitted), ignoring")
            continue

        import_date_str = import_date.strftime("%Y-%m-%d")
        import_version = find_version_at_date(available_versions, import_date)

        if package_name not in packages_in_requirements:
            print(f"[INFO] Package '{package_name}' was not found in requirements.txt, using date of first import (Version {import_version} / {import_date_str})")
            version = import_version
        else:
            req_date = get_date_when_package_committed(package_name, via_requirements=True)
            if req_date is None:
                print(f"[INFO] Package '{package_name}' was not in requirements.txt, using date of first import (Version {import_version} / {import_date_str})")
                version = import_version
            else:
                req_version = find_version_at_date(available_versions, req_date)
                req_date_str = req_date.strftime("%Y-%m-%d")
                if req_version == import_version:
                    # Both strategies agree, no need to ask
                    print(f"[INFO] Package '{package_name}' was attributed version {req_version}")
                    version = import_version
                else:
                    choice = user_response_multi_choices(f"Choose guessing strategy for package '{package_name}'", [
                        f'{"First time the package was imported".ljust(50)} (Version {import_version} / {import_date_str})',
                        f'{"When the package was added to requirements.txt".ljust(50)} (Version {req_version} / {req_date_str})'
                    ])
                    version = req_version if choice == 2 else import_version

        packages.append((package_name, version))

    print("")
    # TODO : Write to requirements.txt
    for package_name, version in sorted(packages, key=lambda entry: entry[0]):
        print(f"{package_name}=={version}")
2021-06-24 07:36:28 +02:00