Resources
Data randomizer for TensorFlow.js neural networks
The way to Feature Engineering

Inputs

A Decision Model and Notation (DMN) file is an XML file that can be edited with bpmn.io. It includes one or more decision tables (left-hand side), whose content expresses some business logic (right-hand side).

As XML

<?xml version="1.0" encoding="UTF-8"?>
<!-- DMN definitions holding one decision ("Get barcode (EAN 13) country") and the input data it requires. -->
<definitions xmlns="https://www.omg.org/spec/DMN/20191111/MODEL/" xmlns:dmndi="https://www.omg.org/spec/DMN/20191111/DMNDI/" xmlns:dc="http://www.omg.org/spec/DMN/20180521/DC/" xmlns:di="http://www.omg.org/spec/DMN/20180521/DI/" id="Get_barcode_EAN_13_country" name="Get barcode (EAN 13) country" namespace="http://camunda.org/schema/1.0/dmn" exporter="Camunda Modeler" exporterVersion="5.19.0">
    <decision id="decision_1ssfotj" name="Get barcode (EAN 13) country">
        <!-- The decision depends on the "Barcode_EAN_13" input data declared at the end of this file. -->
        <informationRequirement id="InformationRequirement_1tka5z8">
            <requiredInput href="#Barcode_EAN_13"/>
        </informationRequirement>
        <!-- hitPolicy="FIRST": when several rules match, only the first matching rule (in document order) provides the output. -->
        <decisionTable id="decisionTable_01leozv" hitPolicy="FIRST">
            <!-- Single input column; its input expression text is left empty in this sample. -->
            <input id="input1" label="Barcode (EAN 13)">
                <inputExpression id="inputExpression1" typeRef="string">
                    <text></text>
                </inputExpression>
            </input>
            <!-- Single output column "Country" with its list of allowed output values. -->
            <!-- NOTE(review): "Japan" is listed as an allowed value but no rule below outputs it. -->
            <output id="output1" label="Country" name="" typeRef="string">
                <outputValues id="UnaryTests_02l74zr">
                    <text>"France-Monaco","Portugal","Japan","Russia","Austria","AT"</text>
                </outputValues>
            </output>
            <!-- Rule 1: 13-character values starting with "3", except those starting with "38" or "39". -->
            <rule id="DecisionRule_0jz55mz">
                <description>https://en.wikipedia.org/wiki/List_of_GS1_country_codes

                    alternative FEEL formula: string length(?)=13 and starts with(?, "3") and not(starts with(?, "38")) and not(starts with(?, "39"))</description>
                <inputEntry id="UnaryTests_1heotr2">
                    <text>string length(?)=13 and starts with(?, "3") and not (matches(?, "^3[89]"))</text>
                </inputEntry>
                <outputEntry id="LiteralExpression_0n357ds">
                    <text>"France-Monaco"</text>
                </outputEntry>
            </rule>
            <!-- Rule 2: values made of "46" followed by exactly 11 digits (13 characters in total). -->
            <rule id="DecisionRule_17lwmto">
                <description>https://en.wikipedia.org/wiki/List_of_GS1_country_codes</description>
                <inputEntry id="UnaryTests_05tamb8">
                    <text>matches(?, "^46\d{11}$")</text>
                </inputEntry>
                <outputEntry id="LiteralExpression_0igo4te">
                    <text>"Russia"</text>
                </outputEntry>
            </rule>
            <!-- Rule 3: 13-character values starting with "560". -->
            <rule id="DecisionRule_1dd4yek">
                <description></description>
                <inputEntry id="UnaryTests_1i7g6md">
                    <text>string length(?)=13 and starts with(?, "560")</text>
                </inputEntry>
                <outputEntry id="LiteralExpression_1wnpj3d">
                    <text>"Portugal"</text>
                </outputEntry>
            </rule>
            <!-- Rule 4: values made of "9", then "0" or "1", then exactly 10 digits. -->
            <!-- NOTE(review): this pattern matches 12 characters in total, whereas the other rules expect 13; \d{11} may be intended. Confirm. -->
            <rule id="DecisionRule_01z8rju">
                <inputEntry id="UnaryTests_12b599r">
                    <text>matches(?, "^9[0-1]{1}\d{10}$")</text>
                </inputEntry>
                <outputEntry id="LiteralExpression_11c8ort">
                    <text>"AT"</text>
                </outputEntry>
            </rule>
            <!-- Rule 5: same input test as rule 4, with output "Austria" instead of "AT". -->
            <!-- NOTE(review): with hitPolicy="FIRST", rule 4 always wins over this rule, so "Austria" cannot be produced. Confirm intent. -->
            <rule id="DecisionRule_132guif">
                <inputEntry id="UnaryTests_047vd6x">
                    <text>matches(?, "^9[0-1]{1}\d{10}$")</text>
                </inputEntry>
                <outputEntry id="LiteralExpression_11env8p">
                    <text>"Austria"</text>
                </outputEntry>
            </rule>
        </decisionTable>
    </decision>
    <!-- Input data referenced by the decision's information requirement. -->
    <inputData id="Barcode_EAN_13" name="Barcode (EAN 13)"/>
    <!-- Graphical data -->
</definitions>

As JSON

A DMN file is parsed from XML into a JSON representation of its content by means of the dmn-moddle library (see project on GitHub).

{
  "$type": "dmn:Definitions",
  "id": "Get_barcode_EAN_13_country",
  "name": "Get barcode (EAN 13) country",
  "namespace": "http://camunda.org/schema/1.0/dmn",
  "exporter": "Camunda Modeler",
  "exporterVersion": "5.19.0",
  "drgElement": [
    {
      "$type": "dmn:Decision",
      "id": "decision_1ssfotj",
      "name": "Get barcode (EAN 13) country",
      "informationRequirement": [
        {
          "$type": "dmn:InformationRequirement",
          "id": "InformationRequirement_1tka5z8",
          "requiredInput": {
            "$type": "dmn:DMNElementReference",
            "href": "#Barcode_EAN_13"
          }
        }
      ],
      "decisionLogic": {
        "$type": "dmn:DecisionTable",
        "id": "decisionTable_01leozv",
        "hitPolicy": "FIRST",
        "input": [
          {
            "$type": "dmn:InputClause",
            "id": "input1",
            "label": "Barcode (EAN 13)",
            "inputExpression": {
              "$type": "dmn:LiteralExpression",
              "id": "inputExpression1",
              "typeRef": "string",
              "text": ""
            }
          }
        ],
        "output": [
          {
            "$type": "dmn:OutputClause",
            "id": "output1",
            "label": "Country",
            "name": "",
            "typeRef": "string",
            "outputValues": {
              "$type": "dmn:UnaryTests",
              "id": "UnaryTests_02l74zr",
              "text": "\"France-Monaco\",\"Portugal\",\"Japan\",\"Russia\",\"Austria\",\"AT\""
            }
          }
        ],
        "rule": [
          {
            "$type": "dmn:DecisionRule",
            "id": "DecisionRule_0jz55mz",
            "description": "https://en.wikipedia.org/wiki/List_of_GS1_country_codes\n\nalternative FEEL formula: string length(?)=13 and starts with(?, \"3\") and not(starts with(?, \"38\")) and not(starts with(?, \"39\"))",
            "inputEntry": [
              {
                "$type": "dmn:UnaryTests",
                "id": "UnaryTests_1heotr2",
                "text": "string length(?)=13 and starts with(?, \"3\") and not (matches(?, \"^3[89]\"))"
              }
            ],
            "outputEntry": [
              {
                "$type": "dmn:LiteralExpression",
                "id": "LiteralExpression_0n357ds",
                "text": "\"France-Monaco\""
              }
            ]
          },
          {
            "$type": "dmn:DecisionRule",
            "id": "DecisionRule_17lwmto",
            "description": "https://en.wikipedia.org/wiki/List_of_GS1_country_codes",
            "inputEntry": [
              {
                "$type": "dmn:UnaryTests",
                "id": "UnaryTests_05tamb8",
                "text": "matches(?, \"^46\\d{11}$\")"
              }
            ],
            "outputEntry": [
              {
                "$type": "dmn:LiteralExpression",
                "id": "LiteralExpression_0igo4te",
                "text": "\"Russia\""
              }
            ]
          },
          {
            "$type": "dmn:DecisionRule",
            "id": "DecisionRule_1dd4yek",
            "description": "",
            "inputEntry": [
              {
                "$type": "dmn:UnaryTests",
                "id": "UnaryTests_1i7g6md",
                "text": "string length(?)=13 and starts with(?, \"560\")"
              }
            ],
            "outputEntry": [
              {
                "$type": "dmn:LiteralExpression",
                "id": "LiteralExpression_1wnpj3d",
                "text": "\"Portugal\""
              }
            ]
          },
          {
            "$type": "dmn:DecisionRule",
            "id": "DecisionRule_01z8rju",
            "inputEntry": [
              {
                "$type": "dmn:UnaryTests",
                "id": "UnaryTests_12b599r",
                "text": "matches(?, \"^9[0-1]{1}\\d{10}$\")"
              }
            ],
            "outputEntry": [
              {
                "$type": "dmn:LiteralExpression",
                "id": "LiteralExpression_11c8ort",
                "text": "\"AT\""
              }
            ]
          },
          {
            "$type": "dmn:DecisionRule",
            "id": "DecisionRule_132guif",
            "inputEntry": [
              {
                "$type": "dmn:UnaryTests",
                "id": "UnaryTests_047vd6x",
                "text": "matches(?, \"^9[0-1]{1}\\d{10}$\")"
              }
            ],
            "outputEntry": [
              {
                "$type": "dmn:LiteralExpression",
                "id": "LiteralExpression_11env8p",
                "text": "\"Austria\""
              }
            ]
          }
        ]
      }
    },
    {
      "$type": "dmn:InputData",
      "id": "Barcode_EAN_13",
      "name": "Barcode (EAN 13)"
    }
  ],
  … // Omitted here: graphical (drawing) coordinates
}

The JSON object above, say diagram, may be processed in TypeScript by means of the functions and types offered in ts/common/Settings.ts (see project on GitHub).

// Keep only the elements of the diagram that pass the Is_DMN_Decision guard, then handle
// each decision whose logic passes the Is_DMN_DecisionTable guard (body elided in this sample).
diagram.drgElement.filter(Is_DMN_Decision).forEach((decision: DMN_Decision) => {
    if(Is_DMN_DecisionTable(decision.decisionLogic)) {…}
});

Outputs

A data randomizer has to produce a data set as follows.

{
  "status": "RANDOMIZED",
  "data": [
    {
      "Barcode (EAN 13)": "3274080005003",
      "Country": "France-Monaco"
    },
    {
      "Barcode (EAN 13)": "5603722493942",
      "Country": "Portugal"
    },
    {
      "Barcode (EAN 13)": "4606453849072",
      "Country": "Russia"
    },
    // Etc. "data" array length results from "size" attribute below...
  ]
}
Realistic example

Data source

Data sources on the Web are raw data. For example, data from https://nudger.fr/opendata/gtin-open-data.zip, once unzipped, obey the following CSV structure (1000-item extract).

code,"brand","model","name","last_updated","gs1_country","gtinType","offers_count","min_price","min_price_compensation","currency","categories","url"
9110200705009,"OTHER",,"Lit Adulte - Lit à sommier tapissier avec matelas moelleux - Meuble de Chambre à Coucher - Noir 180x200 cm Tissu -MN28763","1706101081492","AT","GTIN_13","1","673.99","2.6959600000000004","EUR","",""
9110200704170,"OTHER",,"Lit Adulte - Lit à sommier tapissier avec matelas moelleux - Meuble de Chambre à Coucher - Noir 160x200 cm Tissu -MN83316","1706101081497","AT","GTIN_13","1","563.99","2.25596","EUR","",""
9110200704187,"OTHER",,"Lit Adulte - Lit à sommier tapissier avec matelas moelleux - Meuble de Chambre à Coucher - Noir 160x200 cm Tissu -MN90540","1706101081500","AT","GTIN_13","1","640.99","2.5639600000000002","EUR","",""
9110200704620,"OTHER",,"Lit Adulte - Lit à sommier tapissier avec matelas moelleux - Meuble de Chambre à Coucher - Noir 160x200 cm Tissu -MN59666","1706101081491","AT","GTIN_13","1","654.99","2.6199600000000003","EUR","",""

Within the randomization process, code in the data source has to feed "Barcode (EAN 13)" in the DMN file, "gtinType" in the data source has to match "GTIN_13" in the DMN file, and "gs1_country" in the data source has to feed "Country" in the DMN file.

The following DMN file indicates, in its first column, a predefined tailor-made Web service that is capable of carrying out the randomization process, i.e., delivering randomized data.

Work

  1. For the Get barcode (EAN 13) country DMN diagram above, construct a tailor-made Web service that randomizes data from https://nudger.fr/opendata/gtin-open-data.zip
  2. For the Get barcode (EAN 13) country DMN diagram above, construct a tailor-made Web service that randomizes data from https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz
  3. Construct a generic (POST) Web service, which consumes as input a DMN file ("XML" attribute below) and delegates randomization to third-party Web services, which encapsulate data specificity, like compression types, internal data formats between CSV, TSV, Parquet, JSON Lines, XML, or RDF (optional)… Significant data volumes, varied compression types, heterogeneous formats… require using the Web streams Application Programming Interface -API- and devoted libraries like JSZip, Papa Parse, d3-dsv, stream-json, node-tar, JSONStream here… and there
  4. From the incoming DMN diagram, say X.dmn, generate a X.schema.json JSON Schema file for expressing and controlling the expected format of randomized data
  5. Construct your own case (see data sets and case studies below)
{
    "size": 1000,
    "XML": "<?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="https://www.omg.org/spec/DMN/20191111/MODEL/" xmlns:dmndi="https://www.omg.org/spec/DMN/20191111/DMNDI/" xmlns:dc="http://www.omg.org/spec/DMN/20180521/DC/" xmlns:modeler="http://camunda.org/schema/modeler/1.0" xmlns:biodi="http://bpmn.io/schema/dmn/biodi/2.0" id="Definitions_0iveoyx" name="DRD" namespace="http://camunda.org/schema/1.0/dmn" exporter="Camunda Modeler" exporterVersion="5.26.0" modeler:executionPlatform="Camunda Cloud" modeler:executionPlatformVersion="8.4.0">
    <decision id="Decision_1h7nlw3" name="Get barcode (EAN 13) country">
        <decisionTable id="DecisionTable_08nmjry">
            <input id="Input_1" label="Web service">
                <inputExpression id="InputExpression_1" typeRef="string">
                    <text>"FranckBarbier.com/API/nudger.fr/opendata/gtin-open-data.zip"</text>
                </inputExpression>
            </input>
        …"
}

The generic (POST) responding Web service has to generate, say 1000 items, based on data coming from data sources whose “complexity” calls for a supplier tailor-made Web service in charge of extraction.

Node.js architecture issues

Help!

Data sets

  1. French INSEE
  2. French real estate
  3. French transportation
  4. French weather forecast
  5. Kaggle
  6. OpenML
  7. Etc.

Case studies

  1. Data about daily forecast for 16 days ahead is available here… Concrete file sample (date is Jan. 18, 1970) as illustration is here… (22,635 cities). Note that data are in JSON format while professional subscription to the API delivers data in CSV format.
  2. Social attitude in female/male voice. Implementation in progress within project on GitHub.
  3. UFOs here… and there
  4. Damaged cars: here
  5. Car accident scenarios: here
  6. Etc.

Microsoft tutorial on machine learning here

Subtlety

Data may be found by means of good-score string matching. For example, to populate Barcode (EAN 13) with randomized values, a match can be computed between Barcode (EAN 13) and each column name in the CSV data: code, "brand"… Matching may be scored by means of Fuse.js or any equivalent library. Logically, code should have the best score relative to Barcode (EAN 13); the values of the code column then populate Barcode (EAN 13) in the output. The same applies to the "gs1_country" column name and Country in the output, etc.