# Source code for dp_mobility_report.dpmreport

import os
import warnings
from pathlib import Path
from shutil import rmtree
from typing import List, Optional, Union

import numpy as np
import pandas as pd
from geopandas import GeoDataFrame
from pandarallel import pandarallel
from pandas import DataFrame
from tqdm.auto import tqdm

from dp_mobility_report import constants as const
from dp_mobility_report.model import preprocessing
from dp_mobility_report.report import report
from dp_mobility_report.report.html.templates import (
    create_html_assets,
    create_maps_folder,
    render_html,
)


class DpMobilityReport:
    """Generate a (differentially private) mobility report from a mobility dataset.

    The report will be generated as an HTML file, using the ``.to_file()`` method.

    Args:
        df: ``DataFrame`` containing the mobility data. Expected columns: User ID ``uid``, trip ID ``tid``, timestamp ``datetime`` (or ``int`` to indicate sequence position, if dataset only consists of sequences without timestamps), latitude ``lat`` and longitude ``lng`` in CRS EPSG:4326.
        tessellation: Geopandas ``GeoDataFrame`` containing the tessellation for spatial aggregations. Expected columns: ``tile_id``. If tessellation is not provided in the expected default CRS EPSG:4326, it will automatically be transformed. If no tessellation is provided, all analyses based on the tessellation will automatically be removed.
        privacy_budget: Privacy budget for the differentially private report. Defaults to ``None``, i.e., no privacy guarantee is provided.
        user_privacy: Whether item-level or user-level privacy is applied. Defaults to ``True`` (user-level privacy).
        max_trips_per_user: Maximum number of trips a user is allowed to contribute to the data. Dataset will be sampled accordingly. Defaults to ``None``, i.e., all trips are used. This implies that the actual maximum number of trips per user will be used according to the data, though this violates user-level Differential Privacy.
        analysis_selection: Select only needed analyses. A selection reduces computation time and leaves more privacy budget for higher accuracy of other analyses. ``analysis_selection`` takes a list of all analyses to be included. Alternatively, a list of analyses to be excluded can be set with ``analysis_exclusion``. Either entire segments can be included: ``const.OVERVIEW``, ``const.PLACE_ANALYSIS``, ``const.OD_ANALYSIS``, ``const.USER_ANALYSIS`` or any single analysis can be included: ``const.DS_STATISTICS``, ``const.MISSING_VALUES``, ``const.TRIPS_OVER_TIME``, ``const.TRIPS_PER_WEEKDAY``, ``const.TRIPS_PER_HOUR``, ``const.VISITS_PER_TILE``, ``const.VISITS_PER_TIME_TILE``, ``const.OD_FLOWS``, ``const.TRAVEL_TIME``, ``const.JUMP_LENGTH``, ``const.TRIPS_PER_USER``, ``const.USER_TIME_DELTA``, ``const.RADIUS_OF_GYRATION``, ``const.USER_TILE_COUNT``, ``const.MOBILITY_ENTROPY``. Default is None, i.e., all analyses are included.
        analysis_exclusion: Ignored, if ``analysis_selection`` is set! ``analysis_exclusion`` takes a list of all analyses to be excluded. Either entire segments can be excluded: ``const.OVERVIEW``, ``const.PLACE_ANALYSIS``, ``const.OD_ANALYSIS``, ``const.USER_ANALYSIS`` or any single analysis can be excluded: ``const.DS_STATISTICS``, ``const.MISSING_VALUES``, ``const.TRIPS_OVER_TIME``, ``const.TRIPS_PER_WEEKDAY``, ``const.TRIPS_PER_HOUR``, ``const.VISITS_PER_TILE``, ``const.VISITS_PER_TIME_TILE``, ``const.OD_FLOWS``, ``const.TRAVEL_TIME``, ``const.JUMP_LENGTH``, ``const.TRIPS_PER_USER``, ``const.USER_TIME_DELTA``, ``const.RADIUS_OF_GYRATION``, ``const.USER_TILE_COUNT``, ``const.MOBILITY_ENTROPY``.
        budget_split: ``dict`` to customize how much privacy budget is assigned to which analysis. Each key needs to be named according to an analysis and the value needs to be an integer indicating the weight for the privacy budget. If no weight is assigned, a default weight of 1 is set. For example, if ``budget_split = {const.VISITS_PER_TILE: 10}``, then the privacy budget for ``visits_per_tile`` is 10 times higher than for every other analysis, which all get a default weight of 1. Possible ``dict`` keys (all analyses): ``const.DS_STATISTICS``, ``const.MISSING_VALUES``, ``const.TRIPS_OVER_TIME``, ``const.TRIPS_PER_WEEKDAY``, ``const.TRIPS_PER_HOUR``, ``const.VISITS_PER_TILE``, ``const.VISITS_PER_TIME_TILE``, ``const.OD_FLOWS``, ``const.TRAVEL_TIME``, ``const.JUMP_LENGTH``, ``const.TRIPS_PER_USER``, ``const.USER_TIME_DELTA``, ``const.RADIUS_OF_GYRATION``, ``const.USER_TILE_COUNT``, ``const.MOBILITY_ENTROPY``. Defaults to ``None``, which is treated as an empty ``dict``.
        timewindows: List of hours as ``int`` that define the timewindows for the spatial analysis for single time windows. Defaults to ``[2, 6, 10, 14, 18, 22]``. The given object is not modified; a sorted copy is stored.
        max_travel_time: Upper bound for travel time histogram. If ``None`` is given, no upper bound is set. Defaults to ``None``.
        bin_range_travel_time: The range a single histogram bin spans for travel time (e.g., 5 for 5 min bins). If ``None`` is given, the histogram bins will be determined automatically. Defaults to ``None``.
        max_jump_length: Upper bound for jump length histogram. If ``None`` is given, no upper bound is set. Defaults to ``None``.
        bin_range_jump_length: The range a single histogram bin spans for jump length (e.g., 1 for 1 km bins). If ``None`` is given, the histogram bins will be determined automatically. Defaults to ``None``.
        max_radius_of_gyration: Upper bound for radius of gyration histogram. If ``None`` is given, no upper bound is set. Defaults to ``None``.
        bin_range_radius_of_gyration: The range a single histogram bin spans for the radius of gyration (e.g., 1 for 1 km bins). If ``None`` is given, the histogram bins will be determined automatically. Defaults to ``None``.
        max_user_tile_count: Upper bound for distinct tiles per user histogram. If ``None`` is given, no upper bound is set. Defaults to ``None``.
        bin_range_user_tile_count: The range a single histogram bin spans for the distinct tiles per user histogram. If ``None`` is given, the histogram bins will be determined automatically. Defaults to ``None``.
        max_user_time_delta: Upper bound for user time delta histogram. If ``None`` is given, no upper bound is set. Defaults to ``None``.
        bin_range_user_time_delta: The range a single histogram bin spans for user time delta (e.g., 1 for 1 hour bins). If ``None`` is given, the histogram bins will be determined automatically. Defaults to ``None``.
        subtitle: Custom subtitle that appears at the top of the HTML report. Defaults to ``None``.
        disable_progress_bar: Whether progress bars should be hidden. Defaults to ``False``, i.e., progress bars are shown.
        seed_sampling: Provide seed for down-sampling of dataset (according to ``max_trips_per_user``) so that the sampling is reproducible. Defaults to ``None``, i.e., no seed.
        evalu: Parameter only needed for development and evaluation purposes. Defaults to ``False``."""

    # Immutable source for the default ``timewindows``; copied per instance.
    _DEFAULT_TIMEWINDOWS = (2, 6, 10, 14, 18, 22)

    _report: dict = {}
    _html: str = ""
    _df: DataFrame
    _tessellation: Optional[GeoDataFrame]
    _privacy_budget: Optional[float]
    _max_trips_per_user: int
    _analysis_exclusion: list
    _budget_split: dict

    def __init__(
        self,
        df: DataFrame,
        tessellation: Optional[GeoDataFrame] = None,
        privacy_budget: Optional[Union[int, float]] = None,
        user_privacy: bool = True,
        max_trips_per_user: Optional[int] = None,
        analysis_selection: Optional[List[str]] = None,
        analysis_exclusion: Optional[List[str]] = None,
        budget_split: Optional[dict] = None,
        timewindows: Optional[Union[List[int], np.ndarray]] = None,
        max_travel_time: Optional[int] = None,
        bin_range_travel_time: Optional[int] = None,
        max_jump_length: Optional[Union[int, float]] = None,
        bin_range_jump_length: Optional[Union[int, float]] = None,
        max_radius_of_gyration: Optional[Union[int, float]] = None,
        bin_range_radius_of_gyration: Optional[Union[int, float]] = None,
        max_user_tile_count: Optional[int] = None,
        bin_range_user_tile_count: Optional[int] = None,
        max_user_time_delta: Optional[Union[int, float]] = None,
        bin_range_user_time_delta: Optional[Union[int, float]] = None,
        subtitle: Optional[str] = None,
        disable_progress_bar: bool = False,
        seed_sampling: Optional[int] = None,
        evalu: bool = False,
    ) -> None:
        # Build the mutable defaults per call so no object is shared between
        # instances. Done before validation so the validators see the same
        # values they would have received from the former literal defaults.
        if budget_split is None:
            budget_split = {}
        if timewindows is None:
            timewindows = list(self._DEFAULT_TIMEWINDOWS)

        preprocessing.validate_input(
            df,
            tessellation,
            privacy_budget,
            max_trips_per_user,
            analysis_selection,
            analysis_exclusion,
            budget_split,
            disable_progress_bar,
            evalu,
            user_privacy,
            timewindows,
            max_travel_time,
            bin_range_travel_time,
            max_jump_length,
            bin_range_jump_length,
            max_radius_of_gyration,
            bin_range_radius_of_gyration,
            max_user_tile_count,
            bin_range_user_tile_count,
            max_user_time_delta,
            bin_range_user_time_delta,
            seed_sampling,
        )

        (
            analysis_selection,
            analysis_exclusion,
        ) = preprocessing.validate_inclusion_exclusion(
            analysis_selection,
            analysis_exclusion,
        )

        # Per-instance cache for the lazily computed report (see ``report``).
        self._report = {}

        self.user_privacy = user_privacy
        with tqdm(  # progress bar
            total=2, desc="Preprocess data", disable=disable_progress_bar
        ) as pbar:
            self._tessellation = (
                None
                if tessellation is None
                else preprocessing.preprocess_tessellation(tessellation)
            )
            pbar.update()

            # Without an explicit cap, fall back to the largest number of
            # distinct trips any single user has in the raw input.
            self._max_trips_per_user = (
                max_trips_per_user
                if max_trips_per_user is not None
                else df.groupby(const.UID)[const.TID].nunique().max()
            )

            # Under user-level privacy the base sensitivity is the trip cap;
            # under item-level privacy it is 1.
            self.count_sensitivity_base = (
                self._max_trips_per_user if user_privacy else 1
            )
            self._df = preprocessing.preprocess_data(
                df.copy(),  # copy, to not overwrite users instance of df
                self.tessellation,
                self.max_trips_per_user,
                self.user_privacy,
                seed_sampling,
            )
            pbar.update()

        self._privacy_budget = None if privacy_budget is None else float(privacy_budget)
        self.max_travel_time = max_travel_time
        # np.sort returns a sorted copy, so the caller's list/array is left
        # untouched (an in-place ``.sort()`` would reorder it).
        self.timewindows = np.sort(np.asarray(timewindows))
        self.max_jump_length = max_jump_length
        self.bin_range_jump_length = bin_range_jump_length
        self.bin_range_travel_time = bin_range_travel_time
        self.max_radius_of_gyration = max_radius_of_gyration
        self.bin_range_radius_of_gyration = bin_range_radius_of_gyration
        self.max_user_tile_count = max_user_tile_count
        self.bin_range_user_tile_count = bin_range_user_tile_count
        self.max_user_time_delta = max_user_time_delta
        self.bin_range_user_time_delta = bin_range_user_time_delta

        trips_per_user = self.df.groupby(const.UID)[const.TID].nunique()
        self._analysis_exclusion = preprocessing.clean_analysis_exclusion(
            analysis_selection,
            analysis_exclusion,
            has_tessellation=(tessellation is not None),
            has_points_inside_tessellation=preprocessing.has_points_inside_tessellation(
                self.df, self.tessellation
            ),
            # NOTE(review): is_datetime64_dtype looks like it only matches
            # timezone-naive datetime64 columns -- confirm whether tz-aware
            # input should also count as "has timestamps".
            has_timestamps=pd.api.types.is_datetime64_dtype(self.df[const.DATETIME]),
            # are there trips with more than a single record?
            has_od_flows=self.df[const.TID].value_counts().max() > 1,
            # does any user have more than one trip?
            has_consecutive_user_trips=trips_per_user.max() > 1,
        )
        self._budget_split = preprocessing.clean_budget_split(
            budget_split, self._analysis_exclusion
        )
        self.evalu = evalu
        self.disable_progress_bar = disable_progress_bar
        self.subtitle = subtitle

        # initialize parallel processing
        pandarallel.initialize(verbose=0)

    @property
    def report(self) -> dict:
        """A dictionary with all report elements (i.e., analyses).

        Computed on first access and cached afterwards.
        """
        if not self._report:
            self._report = report.report_elements(self)
        return self._report

    @property
    def df(self) -> DataFrame:
        """DataFrame containing the processed input mobility data of the report."""
        return self._df

    @property
    def max_trips_per_user(self) -> int:
        """Maximum number of trips per user as specified in the parameters. If ``None`` was given, this equals the actual maximum according to the data."""
        return self._max_trips_per_user

    @property
    def budget_split(self) -> dict:
        """Budget split as specified in the parameters."""
        return self._budget_split

    @property
    def analysis_exclusion(self) -> list:
        """List of analyses that have been excluded from the report and similarity measures. If analysis selection was provided as a parameter, they are inverted to this ``analysis_exclusion`` parameter."""
        return self._analysis_exclusion

    @property
    def privacy_budget(self) -> Optional[float]:
        """Privacy budget as specified in the parameters (as ``float``), or ``None`` if no budget was given."""
        return self._privacy_budget

    @property
    def tessellation(self) -> Optional[GeoDataFrame]:
        """Processed tessellation, or ``None`` if no tessellation was provided."""
        return self._tessellation

    def to_file(
        self,
        output_file: Union[str, Path],
        disable_progress_bar: Optional[bool] = None,
        top_n_flows: int = 100,
    ) -> None:
        """Write the report to an HTML file.

        A directory named like ``output_file`` without its extension is created
        next to the file and handed to the asset and map helpers.

        Args:
            output_file: The name or the path of the file to store the ``html`` output. Any extension other than ``.html`` is replaced by ``.html`` and a warning is issued.
            disable_progress_bar: If ``True``, no progress bar is shown. Defaults to ``None``, i.e., the value given to the constructor is used.
            top_n_flows: Determines how many of the top ``n`` origin-destination flows are displayed. Defaults to 100.
        """
        if disable_progress_bar is None:
            disable_progress_bar = self.disable_progress_bar

        if not isinstance(output_file, Path):
            output_file = Path(str(output_file))

        # Normalise the extension for every input type. Previously this only
        # ran for ``Path`` arguments, so a ``str`` without ``.html`` made the
        # output file and its asset directory collide.
        if output_file.suffix != ".html":
            suffix = output_file.suffix
            output_file = output_file.with_suffix(".html")
            warnings.warn(
                f"Extension {suffix} not supported. For now we assume .html was intended. "
                f"To remove this warning, please use .html.",
                stacklevel=2,
            )

        # e.g. "out/report.html" -> directory "out/report", name "report"
        output_dir = output_file.with_suffix("")
        filename = output_file.stem

        output_dir.mkdir(parents=True, exist_ok=True)

        create_html_assets(output_dir)

        # create report if not created yet (to display progress bar in correct order)
        _ = self.report

        # render html
        data, temp_map_folder = render_html(
            self, filename, top_n_flows, disable_progress_bar
        )

        try:
            create_maps_folder(temp_map_folder, output_dir)
        finally:
            # clean up temp folder, even if creating the maps folder failed
            rmtree(temp_map_folder, ignore_errors=True)

        output_file.write_text(data, encoding="utf-8")