diff --git a/example_model.ipynb b/example_model.ipynb index 0a62986..b3d6d63 100644 --- a/example_model.ipynb +++ b/example_model.ipynb @@ -11,20 +11,18 @@ }, { "cell_type": "code", + "execution_count": 1, "metadata": { + "ExecuteTime": { + "end_time": "2025-12-14T21:09:42.696957Z", + "start_time": "2025-12-14T21:09:42.549138Z" + }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ekw8Z93ljC3v", - "outputId": "bdd16698-2ad0-4423-b090-c5ce55fe3053", - "ExecuteTime": { - "end_time": "2025-12-14T21:09:42.696957Z", - "start_time": "2025-12-14T21:09:42.549138Z" - } + "outputId": "bdd16698-2ad0-4423-b090-c5ce55fe3053" }, - "source": [ - "!python --version" - ], "outputs": [ { "name": "stdout", @@ -34,52 +32,79 @@ ] } ], - "execution_count": 1 + "source": [ + "!python --version" + ] }, { "cell_type": "code", + "execution_count": 2, "metadata": { + "ExecuteTime": { + "end_time": "2025-12-14T21:09:44.281889Z", + "start_time": "2025-12-14T21:09:42.698161Z" + }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "yoy_wT1rhMqF", - "outputId": "e038b50f-1b61-4334-be62-28f4dc40a0a0", - "ExecuteTime": { - "end_time": "2025-12-14T21:09:44.281889Z", - "start_time": "2025-12-14T21:09:42.698161Z" - } + "outputId": "e038b50f-1b61-4334-be62-28f4dc40a0a0" }, - "source": [ - "# Install dependencies\n", - "!pip install -q --upgrade numerapi pandas pyarrow matplotlib lightgbm scikit-learn scipy cloudpickle==3.1.1" - ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m25.2\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m25.3\u001B[0m\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n" + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n" ] } ], - "execution_count": 2 + "source": [ + "# Install dependencies\n", + "!pip install -q --upgrade numerapi pandas pyarrow matplotlib lightgbm scikit-learn scipy cloudpickle==3.1.1" + ] }, { "cell_type": "code", + "execution_count": 3, "metadata": { + "ExecuteTime": { + "end_time": "2025-12-14T21:10:08.471862Z", + "start_time": "2025-12-14T21:09:44.283405Z" + }, "colab": { "base_uri": "https://localhost:8080/", "height": 160 }, "id": "13hdRk9ghMqI", - "outputId": "d2274374-fd85-4189-f27b-d9d466cc63ca", - "ExecuteTime": { - "end_time": "2025-12-14T21:10:08.471862Z", - "start_time": "2025-12-14T21:09:44.283405Z" - } + "outputId": "d2274374-fd85-4189-f27b-d9d466cc63ca" }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-12-14 13:09:45,386 INFO numerapi.utils: target file already exists\n", + "2025-12-14 13:09:45,387 INFO numerapi.utils: download complete\n", + "2025-12-14 13:09:46,291 INFO numerapi.utils: target file already exists\n", + "2025-12-14 13:09:46,291 INFO numerapi.utils: download complete\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001259 seconds.\n", + "You can set `force_row_wise=true` to remove the overhead.\n", + "And if memory is not enough, you can set `force_col_wise=true`.\n", + "[LightGBM] [Info] Total Bins 210\n", + "[LightGBM] [Info] Number of data points in the train set: 688184, number of used features: 42\n", + "[LightGBM] [Info] Start training from score 0.499946\n" + ] + } + ], "source": [ "from numerapi import NumerAPI\n", "import pandas as pd\n", @@ -155,32 +180,7 @@ " files.download('example_model.pkl')\n", "except:\n", " pass" - ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-12-14 13:09:45,386 INFO numerapi.utils: target file already exists\n", - "2025-12-14 13:09:45,387 INFO numerapi.utils: download complete\n", - "2025-12-14 13:09:46,291 INFO numerapi.utils: target file already exists\n", - "2025-12-14 13:09:46,291 INFO numerapi.utils: download complete\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001259 seconds.\n", - "You can set `force_row_wise=true` to remove the overhead.\n", - "And if memory is not enough, you can set `force_col_wise=true`.\n", - "[LightGBM] [Info] Total Bins 210\n", - "[LightGBM] [Info] Number of data points in the train set: 688184, number of used features: 42\n", - "[LightGBM] [Info] Start training from score 0.499946\n" - ] - } - ], - "execution_count": 3 + ] } ], "metadata": { diff --git a/feature_neutralization.ipynb b/feature_neutralization.ipynb index 9c2edd2..d75f1ed 100644 --- a/feature_neutralization.ipynb +++ b/feature_neutralization.ipynb @@ -8,33 +8,31 @@ "source": [ "# Feature Neutralization\n", "\n", - "One thing that makes predicting the stock market so hard is the \"non-stationary\" relationship between features and returns. Features can have strong predictive power some eras but not others - or may completely reverse over time.\n", + "Predicting the stock market is difficult due to the \"non-stationary\" relationship between features and returns. Features can have strong predictive power in some eras but not others—or may completely reverse over time.\n", "\n", - "This uncertainty is what we call \"feature risk\". In order to create models that have consistent performance, it is helpful to reduce this feature risk via \"feature neutralization\". In this notebook, we will:\n", + "This uncertainty is called \"feature risk.\" To create models with consistent performance, it helps to reduce feature risk via \"feature neutralization.\" In this notebook, we will:\n", "\n", - "1. Learn how to quantify feature risk\n", + "1. Learn to quantify feature risk\n", "2. Measure our model's feature exposure\n", "3. Apply feature neutralization to our predictions\n", - "4. Measure the performance of our neutralized predictions\n", + "4. Measure the performance of neutralized predictions\n", "5. Pickle and upload our feature-neutral model" ] }, { "cell_type": "code", + "execution_count": 1, "metadata": { + "ExecuteTime": { + "end_time": "2025-12-14T22:31:02.231100Z", + "start_time": "2025-12-14T22:31:02.072885Z" + }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "ws4qrSssFC9T", - "outputId": "3860d6e5-38ec-4638-82b2-bce4c7365966", - "ExecuteTime": { - "end_time": "2025-12-14T22:31:02.231100Z", - "start_time": "2025-12-14T22:31:02.072885Z" - } + "outputId": "3860d6e5-38ec-4638-82b2-bce4c7365966" }, - "source": [ - "!python --version" - ], "outputs": [ { "name": "stdout", @@ -44,44 +42,46 @@ ] } ], - "execution_count": 1 + "source": [ + "!python --version" + ] }, { "cell_type": "code", + "execution_count": 2, "metadata": { + "ExecuteTime": { + "end_time": "2025-12-14T22:31:03.976313Z", + "start_time": "2025-12-14T22:31:02.232527Z" + }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "iHzZde7Tyu-N", - "outputId": "f9cb52f5-88f3-4776-a1be-cef458e718f5", - "ExecuteTime": { - "end_time": "2025-12-14T22:31:03.976313Z", - "start_time": "2025-12-14T22:31:02.232527Z" - } + "outputId": "f9cb52f5-88f3-4776-a1be-cef458e718f5" }, - "source": [ - "# Install dependencies\n", - "!pip install -q --upgrade numerapi pandas pyarrow matplotlib lightgbm scikit-learn scipy cloudpickle==3.1.1\n", - "!pip install -q --no-deps numerai-tools\n", - "\n", - "# Inline plots\n", - "%matplotlib inline" - ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m25.2\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m25.3\u001B[0m\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n", "\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m25.2\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m25.3\u001B[0m\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n" + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n" ] } ], - "execution_count": 2 + "source": [ + "# Install dependencies\n", + "!pip install -q --upgrade numerapi pandas pyarrow matplotlib lightgbm scikit-learn scipy cloudpickle==3.1.1\n", + "!pip install -q --no-deps numerai-tools\n", + "\n", + "# Inline plots\n", + "%matplotlib inline" + ] }, { "cell_type": "markdown", @@ -91,7 +91,7 @@ "source": [ "## 1. Feature Risk\n", "\n", - "In order to quantify feature risk, we evaluate the performance of each feature on their own." + "To quantify feature risk, we evaluate the performance of each feature individually." ] }, { @@ -101,66 +101,27 @@ }, "source": [ "### Feature Groups\n", - "In the last notebook, you learned about the basic feature sets that Numerai offers. There are also 8 feature groups: `intelligence`, `wisdom`, `charisma`, `dexterity`, `strength`, `constitution`, `agility`, `serenity`. Each group contains a different type of feature. For example all technical signals would be in one group, while all analyst predictions and ratings would be in another group.\n", "\n", - "Let us take a look at feature groups in the small, medium, and all feature sets:" + "In the previous notebook, you learned about the basic feature sets Numerai offers. There are also 8 feature groups: `intelligence`, `wisdom`, `charisma`, `dexterity`, `strength`, `constitution`, `agility`, and `serenity`. Each group contains a different type of feature—for example, all technical signals in one group, all analyst predictions and ratings in another.\n", + "\n", + "Let's examine feature groups across the small, medium, and all feature sets:" ] }, { "cell_type": "code", + "execution_count": 3, "metadata": { + "ExecuteTime": { + "end_time": "2025-12-14T22:31:04.680824Z", + "start_time": "2025-12-14T22:31:03.977656Z" + }, "colab": { "base_uri": "https://localhost:8080/", "height": 385 }, "id": "JTN8-MUmyu-P", - "outputId": "b8d0557f-ae8f-48e8-e707-806ac4683ad4", - "ExecuteTime": { - "end_time": "2025-12-14T22:31:04.680824Z", - "start_time": "2025-12-14T22:31:03.977656Z" - } + "outputId": "b8d0557f-ae8f-48e8-e707-806ac4683ad4" }, - "source": [ - "import json\n", - "import pandas as pd\n", - "from numerapi import NumerAPI\n", - "\n", - "# initialize our API client\n", - "napi = NumerAPI()\n", - "\n", - "# Set data version to one of the latest datasets\n", - "DATA_VERSION = \"v5.2\"\n", - "\n", - "napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n", - "feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n", - "feature_sets = feature_metadata[\"feature_sets\"]\n", - "\n", - "sizes = [\"small\", \"medium\", \"all\"]\n", - "groups = [\n", - " \"intelligence\",\n", - " \"wisdom\",\n", - " \"charisma\",\n", - " \"dexterity\",\n", - " \"strength\",\n", - " \"constitution\",\n", - " \"agility\",\n", - " \"serenity\",\n", - " \"all\"\n", - "]\n", - "\n", - "# compile the intersections of feature sets and feature groups\n", - "subgroups = {}\n", - "for size in sizes:\n", - " subgroups[size] = {}\n", - " for group in groups:\n", - " subgroups[size][group] = (\n", - " set(feature_sets[size])\n", - " .intersection(set(feature_sets[group]))\n", - " )\n", - "\n", - "# convert to data frame and display the feature count of each intersection\n", - "pd.DataFrame(subgroups).applymap(len).sort_values(by=\"all\", ascending=False)" - ], "outputs": [ { "name": "stderr", @@ -174,18 +135,6 @@ }, { "data": { - "text/plain": [ - " small medium all\n", - "all 42 780 2748\n", - "constitution 2 134 335\n", - "charisma 3 116 290\n", - "agility 2 58 145\n", - "wisdom 3 56 140\n", - "strength 1 54 135\n", - "serenity 3 34 95\n", - "dexterity 4 21 51\n", - "intelligence 2 14 35" - ], "text/html": [ "