Skip to content

markowitz.data.french

markowitz.data.french

Loader for Ken French factor datasets.

The Ken French data library distributes ZIP archives containing CSV factor returns quoted in percent. The loader downloads the archive with `urllib.request`, parses the CSV (skipping any header text and copyright footers), and divides by 100 so that the returned frame holds decimal returns.

load_french_factors(dataset: str, *, frequency: str = 'monthly', cache_root: str | os.PathLike[str] | None = None) -> pd.DataFrame

Download (or read from cache) a Ken French factor table.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dataset` | `str` | Dataset identifier, e.g. `"F-F_Research_Data_Factors"`. | *required* |
| `frequency` | `str` | `"monthly"` (default) or `"daily"`. | `'monthly'` |
| `cache_root` | `str \| PathLike[str] \| None` | Optional directory used to cache the raw ZIP payload. | `None` |
Source code in src/markowitz/data/french.py
def load_french_factors(
    dataset: str,
    *,
    frequency: str = "monthly",
    cache_root: str | os.PathLike[str] | None = None,
) -> pd.DataFrame:
    """Download (or read from cache) a Ken French factor table.

    The raw ZIP payload is only written to the cache after it has been
    opened successfully and a CSV member has been extracted from it, so a
    truncated or non-ZIP response is never persisted.

    Parameters
    ----------
    dataset:
        Dataset identifier, e.g. ``"F-F_Research_Data_Factors"``.
    frequency:
        ``"monthly"`` (default) or ``"daily"``.
    cache_root:
        Optional directory used to cache the raw ZIP payload. The directory
        is created if missing; the archive is stored there as
        ``"{dataset}_{frequency}.zip"``.

    Returns
    -------
    pd.DataFrame
        The result of ``_parse_french_csv`` applied to the first CSV member
        of the archive.

    Raises
    ------
    ProviderUnavailableError
        If the archive cannot be downloaded.
    DataIntegrityError
        If the archive (cached or downloaded) is not a valid ZIP file or
        contains no CSV member.
    """
    url = _build_url(dataset, frequency)

    cache_file: Path | None = None
    if cache_root is not None:
        cache_dir = Path(cache_root)
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_file = cache_dir / f"{dataset}_{frequency}.zip"

    # Cache hit: no network access at all.
    if cache_file is not None and cache_file.exists():
        text = _read_french_archive_csv(cache_file.read_bytes(), dataset)
        return _parse_french_csv(text, frequency=frequency)

    try:
        with urllib.request.urlopen(url, timeout=30) as resp:
            payload: bytes = resp.read()
    except urllib.error.URLError as exc:
        raise ProviderUnavailableError(
            f"Kenneth French data library unreachable ({url}): {exc.reason}"
        ) from exc
    except Exception as exc:
        # Boundary handler: any other transport failure is reported as the
        # provider being unavailable, with the original exception chained.
        raise ProviderUnavailableError(
            f"Failed to download French factors from {url}: {exc}"
        ) from exc

    # Validate BEFORE caching: a corrupt payload must not poison the cache,
    # because a cached file is always preferred over a fresh download.
    text = _read_french_archive_csv(payload, dataset)

    if cache_file is not None:
        # Write to a sibling temp file and rename into place so that an
        # interrupted write never leaves a partial archive under the final
        # cache name.
        tmp_file = cache_file.with_name(f"{cache_file.name}.tmp")
        tmp_file.write_bytes(payload)
        tmp_file.replace(cache_file)

    return _parse_french_csv(text, frequency=frequency)


def _read_french_archive_csv(payload: bytes, dataset: str) -> str:
    """Return the decoded text of the first CSV member of a French ZIP archive.

    Raises ``DataIntegrityError`` if *payload* is not a valid ZIP archive or
    holds no member whose name ends in ``.csv`` (case-insensitive).
    """
    try:
        with zipfile.ZipFile(io.BytesIO(payload)) as zf:
            csv_names = [n for n in zf.namelist() if n.lower().endswith(".csv")]
            if not csv_names:
                raise DataIntegrityError(f"no CSV in French archive {dataset}")
            # latin-1 maps every byte to a code point, so decoding cannot fail.
            return zf.read(csv_names[0]).decode("latin-1")
    except zipfile.BadZipFile as exc:
        raise DataIntegrityError(f"French archive {dataset} is corrupt") from exc