# Source code for dp_mobility_report.dpmreport

import os
import warnings
from pathlib import Path
from shutil import rmtree
from typing import List, Optional, Union

import numpy as np
import pandas as pd
from geopandas import GeoDataFrame
from pandarallel import pandarallel
from pandas import DataFrame
from tqdm.auto import tqdm

from dp_mobility_report import constants as const
from dp_mobility_report.model import preprocessing
from dp_mobility_report.report import report
from dp_mobility_report.report.html.templates import (
    create_html_assets,
    create_maps_folder,
    render_html,
)


class DpMobilityReport:
    """Generate a (differentially private) mobility report from a mobility dataset.

    The report will be generated as an HTML file, using the ``.to_file()`` method.

    Args:
        df: ``DataFrame`` containing the mobility data. Expected columns:
            User ID ``uid``, trip ID ``tid``, timestamp ``datetime`` (or ``int``
            to indicate sequence position, if dataset only consists of sequences
            without timestamps), latitude ``lat`` and longitude ``lng`` in
            CRS EPSG:4326.
        tessellation: Geopandas ``GeoDataFrame`` containing the tessellation for
            spatial aggregations. Expected columns: ``tile_id``. If tessellation
            is not provided in the expected default CRS EPSG:4326, it will
            automatically be transformed. If no tessellation is provided, all
            analyses based on the tessellation will automatically be removed.
        privacy_budget: privacy_budget for the differentially private report.
            Defaults to ``None``, i.e., no privacy guarantee is provided.
        user_privacy: Whether item-level or user-level privacy is applied.
            Defaults to ``True`` (user-level privacy).
        max_trips_per_user: Maximum number of trips a user is allowed to
            contribute to the data. Dataset will be sampled accordingly.
            Defaults to ``None``, i.e., all trips are used. This implies that
            the actual maximum number of trips per user will be used according
            to the data, though this violates user-level Differential Privacy.
        analysis_selection: Select only needed analyses. A selection reduces
            computation time and leaves more privacy budget for higher accuracy
            of other analyses. ``analysis_selection`` takes a list of all
            analyses to be included. Alternatively, a list of analyses to be
            excluded can be set with ``analysis_exclusion``.
            Either entire segments can be included: ``const.OVERVIEW``,
            ``const.PLACE_ANALYSIS``, ``const.OD_ANALYSIS``,
            ``const.USER_ANALYSIS`` or any single analysis can be included:
            ``const.DS_STATISTICS``, ``const.MISSING_VALUES``,
            ``const.TRIPS_OVER_TIME``, ``const.TRIPS_PER_WEEKDAY``,
            ``const.TRIPS_PER_HOUR``, ``const.VISITS_PER_TILE``,
            ``const.VISITS_PER_TIME_TILE``, ``const.OD_FLOWS``,
            ``const.TRAVEL_TIME``, ``const.JUMP_LENGTH``,
            ``const.TRIPS_PER_USER``, ``const.USER_TIME_DELTA``,
            ``const.RADIUS_OF_GYRATION``, ``const.USER_TILE_COUNT``,
            ``const.MOBILITY_ENTROPY``. Default is None, i.e., all analyses are
            included.
        analysis_exclusion: Ignored, if ``analysis_selection`` is set!
            ``analysis_exclusion`` takes a list of all analyses to be excluded.
            Either entire segments can be excluded: ``const.OVERVIEW``,
            ``const.PLACE_ANALYSIS``, ``const.OD_ANALYSIS``,
            ``const.USER_ANALYSIS`` or any single analysis can be excluded:
            ``const.DS_STATISTICS``, ``const.MISSING_VALUES``,
            ``const.TRIPS_OVER_TIME``, ``const.TRIPS_PER_WEEKDAY``,
            ``const.TRIPS_PER_HOUR``, ``const.VISITS_PER_TILE``,
            ``const.VISITS_PER_TIME_TILE``, ``const.OD_FLOWS``,
            ``const.TRAVEL_TIME``, ``const.JUMP_LENGTH``,
            ``const.TRIPS_PER_USER``, ``const.USER_TIME_DELTA``,
            ``const.RADIUS_OF_GYRATION``, ``const.USER_TILE_COUNT``,
            ``const.MOBILITY_ENTROPY``.
        budget_split: ``dict`` to customize how much privacy budget is assigned
            to which analysis. Each key needs to be named according to an
            analysis and the value needs to be an integer indicating the weight
            for the privacy budget. If no weight is assigned, a default weight
            of 1 is set. For example, if
            ``budget_split = {const.VISITS_PER_TILE: 10}``, then the privacy
            budget for ``visits_per_tile`` is 10 times higher than for every
            other analysis, which all get a default weight of 1.
            Possible ``dict`` keys (all analyses): ``const.DS_STATISTICS``,
            ``const.MISSING_VALUES``, ``const.TRIPS_OVER_TIME``,
            ``const.TRIPS_PER_WEEKDAY``, ``const.TRIPS_PER_HOUR``,
            ``const.VISITS_PER_TILE``, ``const.VISITS_PER_TIME_TILE``,
            ``const.OD_FLOWS``, ``const.TRAVEL_TIME``, ``const.JUMP_LENGTH``,
            ``const.TRIPS_PER_USER``, ``const.USER_TIME_DELTA``,
            ``const.RADIUS_OF_GYRATION``, ``const.USER_TILE_COUNT``,
            ``const.MOBILITY_ENTROPY``. Defaults to ``None``, which is
            treated like an empty ``dict``.
        timewindows: List of hours as ``int`` that define the timewindows for
            the spatial analysis for single time windows. Defaults to
            ``[2, 6, 10, 14, 18, 22]`` (used when ``None`` is given). The given
            sequence is not modified; a sorted copy is stored.
        max_travel_time: Upper bound for travel time histogram. If ``None`` is
            given, no upper bound is set. Defaults to ``None``.
        bin_range_travel_time: The range a single histogram bin spans for
            travel time (e.g., 5 for 5 min bins). If ``None`` is given, the
            histogram bins will be determined automatically. Defaults to
            ``None``.
        max_jump_length: Upper bound for jump length histogram. If ``None`` is
            given, no upper bound is set. Defaults to ``None``.
        bin_range_jump_length: The range a single histogram bin spans for jump
            length (e.g., 1 for 1 km bins). If ``None`` is given, the histogram
            bins will be determined automatically. Defaults to ``None``.
        max_radius_of_gyration: Upper bound for radius of gyration histogram.
            If ``None`` is given, no upper bound is set. Defaults to ``None``.
        bin_range_radius_of_gyration: The range a single histogram bin spans
            for the radius of gyration (e.g., 1 for 1 km bins). If ``None`` is
            given, the histogram bins will be determined automatically.
            Defaults to ``None``.
        max_user_tile_count: Upper bound for distinct tiles per user histogram.
            If ``None`` is given, no upper bound is set. Defaults to ``None``.
        bin_range_user_tile_count: The range a single histogram bin spans for
            the distinct tiles per user histogram. If ``None`` is given, the
            histogram bins will be determined automatically. Defaults to
            ``None``.
        max_user_time_delta: Upper bound for user time delta histogram. If
            ``None`` is given, no upper bound is set. Defaults to ``None``.
        bin_range_user_time_delta: The range a single histogram bin spans for
            user time delta (e.g., 1 for 1 hour bins). If ``None`` is given,
            the histogram bins will be determined automatically. Defaults to
            ``None``.
        subtitle: Custom subtitle that appears at the top of the HTML report.
            Defaults to ``None``.
        disable_progress_bar: Whether progress bars should be hidden. Defaults
            to ``False``, i.e., progress bars are shown.
        seed_sampling: Provide seed for down-sampling of dataset (according to
            ``max_trips_per_user``) so that the sampling is reproducible.
            Defaults to ``None``, i.e., no seed.
        evalu: Parameter only needed for development and evaluation purposes.
            Defaults to ``False``.
    """

    # Hours used for the time-window analyses when the caller passes none.
    # Kept as an immutable tuple so the default can never be altered by accident.
    _DEFAULT_TIMEWINDOWS = (2, 6, 10, 14, 18, 22)

    # Class-level fallbacks; ``__init__`` rebinds both per instance.
    _report: dict = {}
    _html: str = ""
    _df: DataFrame
    _tessellation: Optional[GeoDataFrame]
    _privacy_budget: Optional[Union[int, float]]
    _max_trips_per_user: int
    _analysis_exclusion: list
    _budget_split: dict

    def __init__(
        self,
        df: DataFrame,
        tessellation: Optional[GeoDataFrame] = None,
        privacy_budget: Optional[Union[int, float]] = None,
        user_privacy: bool = True,
        max_trips_per_user: Optional[int] = None,
        analysis_selection: Optional[List[str]] = None,
        analysis_exclusion: Optional[List[str]] = None,
        budget_split: Optional[dict] = None,
        timewindows: Optional[Union[List[int], np.ndarray]] = None,
        max_travel_time: Optional[int] = None,
        bin_range_travel_time: Optional[int] = None,
        max_jump_length: Optional[Union[int, float]] = None,
        bin_range_jump_length: Optional[Union[int, float]] = None,
        max_radius_of_gyration: Optional[Union[int, float]] = None,
        bin_range_radius_of_gyration: Optional[Union[int, float]] = None,
        max_user_tile_count: Optional[int] = None,
        bin_range_user_tile_count: Optional[int] = None,
        max_user_time_delta: Optional[Union[int, float]] = None,
        bin_range_user_time_delta: Optional[Union[int, float]] = None,
        subtitle: Optional[str] = None,
        disable_progress_bar: bool = False,
        seed_sampling: Optional[int] = None,
        evalu: bool = False,
    ) -> None:
        # Resolve the ``None`` sentinels before validation so the validators
        # always receive a fresh dict / list rather than a shared default.
        if budget_split is None:
            budget_split = {}
        if timewindows is None:
            timewindows = list(self._DEFAULT_TIMEWINDOWS)

        preprocessing.validate_input(
            df,
            tessellation,
            privacy_budget,
            max_trips_per_user,
            analysis_selection,
            analysis_exclusion,
            budget_split,
            disable_progress_bar,
            evalu,
            user_privacy,
            timewindows,
            max_travel_time,
            bin_range_travel_time,
            max_jump_length,
            bin_range_jump_length,
            max_radius_of_gyration,
            bin_range_radius_of_gyration,
            max_user_tile_count,
            bin_range_user_tile_count,
            max_user_time_delta,
            bin_range_user_time_delta,
            seed_sampling,
        )

        (
            analysis_selection,
            analysis_exclusion,
        ) = preprocessing.validate_inclusion_exclusion(
            analysis_selection,
            analysis_exclusion,
        )

        # Per-instance caches (do not rely on the shared class-level objects).
        self._report = {}
        self._html = ""

        self.user_privacy = user_privacy

        with tqdm(  # progress bar
            total=2, desc="Preprocess data", disable=disable_progress_bar
        ) as pbar:
            self._tessellation = (
                None
                if tessellation is None
                else preprocessing.preprocess_tessellation(tessellation)
            )
            pbar.update()

            # Without an explicit cap, fall back to the largest number of
            # distinct trips any single user has in the raw data.
            self._max_trips_per_user = (
                max_trips_per_user
                if max_trips_per_user is not None
                else df.groupby(const.UID)[const.TID].nunique().max()
            )

            # User-level privacy: one user can influence up to
            # ``max_trips_per_user`` trips; item-level privacy: a single one.
            if user_privacy:
                self.count_sensitivity_base = self._max_trips_per_user
            else:
                self.count_sensitivity_base = 1

            self._df = preprocessing.preprocess_data(
                df.copy(),  # copy, to not overwrite users instance of df
                self.tessellation,
                self.max_trips_per_user,
                self.user_privacy,
                seed_sampling,
            )
            pbar.update()

        self._privacy_budget = None if privacy_budget is None else float(privacy_budget)
        self.max_travel_time = max_travel_time
        self.timewindows = self._sorted_timewindows(timewindows)
        self.max_jump_length = max_jump_length
        self.bin_range_jump_length = bin_range_jump_length
        self.bin_range_travel_time = bin_range_travel_time
        self.max_radius_of_gyration = max_radius_of_gyration
        self.bin_range_radius_of_gyration = bin_range_radius_of_gyration
        self.max_user_tile_count = max_user_tile_count
        self.bin_range_user_tile_count = bin_range_user_tile_count
        self.max_user_time_delta = max_user_time_delta
        self.bin_range_user_time_delta = bin_range_user_time_delta

        self._analysis_exclusion = preprocessing.clean_analysis_exclusion(
            analysis_selection,
            analysis_exclusion,
            has_tessellation=(tessellation is not None),
            has_points_inside_tessellation=preprocessing.has_points_inside_tessellation(
                self.df, self.tessellation
            ),
            has_timestamps=pd.api.types.is_datetime64_dtype(self.df[const.DATETIME]),
            # are there trips with more than a single record?
            has_od_flows=self.df[const.TID].value_counts().max() > 1,
            # is there any user with more than one trip?
            has_consecutive_user_trips=(
                self.df.groupby(const.UID)[const.TID].nunique().max() > 1
            ),
        )

        self._budget_split = preprocessing.clean_budget_split(
            budget_split, self._analysis_exclusion
        )
        self.evalu = evalu
        self.disable_progress_bar = disable_progress_bar
        self.subtitle = subtitle

        # initialize parallel processing
        pandarallel.initialize(verbose=0)

    @staticmethod
    def _sorted_timewindows(
        timewindows: Union[List[int], np.ndarray],
    ) -> np.ndarray:
        """Return the time windows as a sorted array without touching the input."""
        # ``np.sort`` always returns a copy, so neither a caller's list nor a
        # caller's array is reordered as a side effect.
        return np.sort(np.asarray(timewindows))

    @staticmethod
    def _resolve_output_file(output_file: Union[str, Path]) -> Path:
        """Return ``output_file`` as a ``Path`` that carries the ``.html`` suffix.

        Any other (or missing) suffix is replaced by ``.html`` and a warning is
        emitted. This applies to ``str`` and ``Path`` inputs alike.
        """
        path = output_file if isinstance(output_file, Path) else Path(str(output_file))
        if path.suffix != ".html":
            suffix = path.suffix
            path = path.with_suffix(".html")
            warnings.warn(
                f"Extension {suffix} not supported. For now we assume .html was intended. "
                f"To remove this warning, please use .html."
            )
        return path

    @property
    def report(self) -> dict:
        """A dictionary with all report elements (i.e., analyses)."""
        # Computed lazily on first access and cached on the instance.
        if not self._report:
            self._report = report.report_elements(self)
        return self._report

    @property
    def df(self) -> DataFrame:
        """DataFrame containing the processed input mobility data of the report."""
        return self._df

    @property
    def max_trips_per_user(self) -> int:
        """Maximum number of trips per user as specified in the parameters.

        If ``None`` was given, this equals the actual maximum according to the data.
        """
        return self._max_trips_per_user

    @property
    def budget_split(self) -> dict:
        """Budget split as specified in the parameters."""
        return self._budget_split

    @property
    def analysis_exclusion(self) -> list:
        """List of analyses that have been excluded from the report and similarity measures.

        If analysis selection was provided as a parameter, they are inverted to
        this ``analysis_exclusion`` parameter.
        """
        return self._analysis_exclusion

    @property
    def privacy_budget(self) -> Optional[Union[int, float]]:
        """Privacy budget as specified in the parameters (``None`` if not set)."""
        return self._privacy_budget

    @property
    def tessellation(self) -> Optional[GeoDataFrame]:
        """Processed tessellation (``None`` if no tessellation was provided)."""
        return self._tessellation

    def to_file(
        self,
        output_file: Union[str, Path],
        disable_progress_bar: Optional[bool] = None,
        top_n_flows: int = 100,
    ) -> None:
        """Write the report to an HTML file.

        Next to the file, a directory with the same name (minus the ``.html``
        suffix) is created; the HTML assets and maps are placed there.

        Args:
            output_file: The name or the path of the file to store the ``html``
                output. If the suffix is not ``.html`` it is replaced by
                ``.html`` and a warning is emitted.
            disable_progress_bar: If ``True``, no progress bar is shown. If
                ``None`` (default), the setting given at construction is used.
            top_n_flows: Determines how many of the top ``n``
                origin-destination flows are displayed. Defaults to 100.
        """
        if disable_progress_bar is None:
            disable_progress_bar = self.disable_progress_bar

        output_file = self._resolve_output_file(output_file)

        # Assets live in a sibling directory named like the file without suffix.
        output_dir = output_file.with_suffix("")
        filename = output_file.stem
        output_dir.mkdir(parents=True, exist_ok=True)

        create_html_assets(output_dir)

        # create report if not created yet (to display progress bar in correct order)
        _ = self.report

        # render html
        data, temp_map_folder = render_html(
            self, filename, top_n_flows, disable_progress_bar
        )
        try:
            create_maps_folder(temp_map_folder, output_dir)
        finally:
            # clean up temp folder, even if copying the maps failed
            rmtree(temp_map_folder, ignore_errors=True)

        output_file.write_text(data, encoding="utf-8")