# -*- coding: utf-8 -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the reproman package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Analyze existing spec or session file system to gather more detailed information
"""
from os.path import normpath
import sys
import time
from reproman.resource.session import get_local_session
from reproman.resource.session import Session
from .common_opts import resref_opt
from .common_opts import resref_type_opt
from .base import Interface
from ..support.constraints import EnsureNone
from ..support.constraints import EnsureStr
from ..support.exceptions import InsufficientArgumentsError
from ..support.param import Parameter
from ..utils import assure_list
from ..utils import pycache_source
from ..utils import to_unicode
from ..resource import get_manager
__docformat__ = 'restructuredtext'
from logging import getLogger
# Module-level logger used by the Retrace interface and the tracing helpers
# defined in this file.
lgr = getLogger('reproman.api.retrace')
class Retrace(Interface):
    """Gather detailed package information from paths or a ReproZip trace file.

    Examples
    --------

      $ reproman retrace --spec reprozip_run.yml > reproman_config.yml

    """

    _params_ = dict(
        spec=Parameter(
            args=("--spec",),
            doc="ReproZip YML file to be analyzed",
            metavar='SPEC',
            # nargs="+",
            constraints=EnsureStr() | EnsureNone(),
        ),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc=("path(s) to be traced. If spec is provided, would trace them\n"
                 "after tracing the spec"),
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        output_file=Parameter(
            args=("-o", "--output-file",),
            doc="Output file. If not specified - printed to stdout",
            metavar='output_file',
            constraints=EnsureStr() | EnsureNone(),
        ),
        resref=Parameter(
            args=("-r", "--resource",),
            dest="resref",
            metavar="RESOURCE",
            doc=("Name or ID of the resource to operate on. To see available\n"
                 "resources, run 'reproman ls'.[PY: Note: As a special case, a session\n"
                 "instance can be passed as the value for `resref`. PY]"),
            constraints=EnsureStr() | EnsureNone()),
        resref_type=resref_type_opt,
    )

    # TODO: add a session/resource so we could trace within
    #       arbitrary sessions
    @staticmethod
    def __call__(path=None, spec=None, output_file=None,
                 resref=None, resref_type="auto"):
        # Trace the given path(s) and/or the files listed in a ReproZip spec,
        # write the resulting environment spec to `output_file` (or stdout
        # when it is not given) and return `(distributions, files)` as
        # produced by identify_distributions().
        #
        # Raises InsufficientArgumentsError if neither `spec` nor `path` is
        # provided.
        if not (spec or path):
            raise InsufficientArgumentsError(
                "Need at least a single --spec or a file"
            )

        # Work on a copy: assure_list() may hand back the very list object
        # the caller passed in (TODO confirm), and it gets extended in place
        # below.
        paths = list(assure_list(path))
        if spec:
            lgr.info("reading spec file %s", spec)
            # TODO: generic loader to auto-detect formats etc
            from reproman.formats.reprozip import ReprozipProvenance
            spec = ReprozipProvenance(spec)
            paths += spec.get_files() or []

        # Convert paths to unicode
        paths = map(to_unicode, paths)
        # If .pyc files come in (common for ReprozipProvenance), the tracers
        # don't recognize them.
        paths = (pycache_source(p) or p for p in paths)
        # The tracers assume normalized paths.
        paths = list(map(normpath, paths))

        if isinstance(resref, Session):
            # TODO: Special case for Python callers.  Is this something we
            # want to handle more generally at the interface level?
            session = resref
        elif resref:
            resource = get_manager().get_resource(resref, resref_type)
            session = resource.get_session()
        else:
            session = get_local_session()

        # TODO: at the moment assumes just a single distribution etc.
        #       Generalize
        # TODO: RF so that only the above portion is reprozip specific.
        # If we are to reuse their layout largely -- the rest should stay as is
        (distributions, files) = identify_distributions(
            paths,
            session=session
        )
        from reproman.distributions.base import EnvironmentSpec
        env_spec = EnvironmentSpec(
            distributions=distributions,
        )
        if files:
            env_spec.files = sorted(files)

        # TODO: generic writer!
        from reproman.formats.reproman import RepromanProvenance
        if output_file:
            # The context manager guarantees the file gets closed even if
            # writing the spec fails half way through.
            with open(output_file, "w") as stream:
                RepromanProvenance.write(stream, env_spec)
        else:
            RepromanProvenance.write(sys.stdout, env_spec)

        return distributions, files
# TODO: sessions should carry state, so that we could trace while inheriting
# any custom PATHs which the original run might have had
def identify_distributions(files, session=None, tracer_classes=None):
    """Identify the distributions (packages) the given files belong to

    Each tracer class is instantiated and offered, in order, the files
    nobody has claimed yet.  Passes over all the tracers are repeated until
    a pass changes nothing, no files are left to trace, or the iteration
    limit is exceeded.

    Parameters
    ----------
    files : iterable
      Files to consider
    session : optional
      Session to trace in.  Its `isdir` is used to detect directories and
      it is passed on to every tracer.  If not provided (or false), the
      result of `get_local_session()` is used.
    tracer_classes : sequence of classes, optional
      Tracers to try, in this order.  Each class must have a `HANDLES_DIRS`
      attribute and accept a `session` keyword argument; its instances must
      provide `identify_distributions(files)` yielding
      `(distribution, remaining_files)` pairs.  If None, the result of
      `get_tracer_classes()` is used.

    Returns
    -------
    distributions : list
      Everything the tracers yielded as distributions, in discovery order
    unknown_files : set of str
      Files which were not determined to belong to any specific distribution
    """
    if tracer_classes is None:
        tracer_classes = get_tracer_classes()

    session = session or get_local_session()
    # TODO create list of appropriate for the `environment` OS tracers
    #      in case of no environment -- get current one
    # TODO: should operate in the session, might be given additional
    #       information not just files

    # Files nobody has claimed yet.  Tracers do not modify this set; each one
    # reports back the subset it could not assign.
    files_to_consider = set(files)

    distributions = []
    files_processed = set()
    files_to_trace = files_to_consider

    niter = 0
    max_niter = 10
    while True:
        niter += 1
        # Snapshot the counters to detect a pass which changed nothing
        nfiles_processed = len(files_processed)
        nfiles_to_trace = len(files_to_trace)
        lgr.info("Entering iteration #%d over Tracers", niter)
        if niter > max_niter:
            lgr.error(
                "We did %s iterations already, something is not right",
                max_niter)
            break

        for Tracer in tracer_classes:
            lgr.debug("Tracing using %s", Tracer.__name__)

            # Pull out directories if the tracer can't handle them
            if Tracer.HANDLES_DIRS:
                files_to_trace = files_to_consider
                files_skipped = set()
            else:
                # TODO: memoize across all loops
                # Directories must be looked for among *all* files still
                # under consideration: the set left over by the previous
                # tracer lacks whatever was skipped for it, so deriving
                # them from that set would leak directories through.
                files_skipped = set(filter(session.isdir, files_to_consider))
                files_to_trace = files_to_consider - files_skipped

            tracer = Tracer(session=session)
            begin = time.time()
            # yoh thinks the idea was that a tracer might trace even without
            # files, so we should not just 'continue' the loop if there are
            # no files_to_trace
            if files_to_trace:
                remaining_files_to_trace = files_to_trace
                nenvs = 0
                for env, remaining_files_to_trace in \
                        tracer.identify_distributions(files_to_trace):
                    distributions.append(env)
                    nenvs += 1
                # Only the remainder reported with the last yield counts
                files_processed |= files_to_trace - remaining_files_to_trace
                files_to_trace = remaining_files_to_trace
                lgr.info("%s: %d envs with %d other files remaining",
                         Tracer.__name__,
                         nenvs,
                         len(files_to_trace))

            # Re-combine any files that were skipped
            files_to_consider = files_to_trace | files_skipped

            lgr.debug("Assigning files to packages by %s took %f seconds",
                      tracer, time.time() - begin)

        nothing_changed = (
            nfiles_processed == len(files_processed)
            and nfiles_to_trace == len(files_to_trace)
        )
        if not files_to_trace or nothing_changed:
            lgr.info("No more changes or files to track. Exiting the loop")
            break

    return distributions, files_to_consider
def get_tracer_classes():
    """Return the list of all available Tracer classes

    Ideally the ordering would be irrelevant, but at the moment it does
    matter and is simply hard-coded here.
    """
    # TODO: automate discovery of available tracers
    from reproman.distributions.debian import DebTracer
    from reproman.distributions.redhat import RPMTracer
    from reproman.distributions.conda import CondaTracer
    from reproman.distributions.venv import VenvTracer
    from reproman.distributions.vcs import VCSTracer
    from reproman.distributions.docker import DockerTracer
    from reproman.distributions.singularity import SingularityTracer

    return [
        DebTracer,
        RPMTracer,
        CondaTracer,
        VenvTracer,
        VCSTracer,
        DockerTracer,
        SingularityTracer,
    ]