// Split.java

/*
 * Copyright © 2014 - 2021 Leipzig University (Database Research Group)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gradoop.flink.model.impl.operators.split;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.gradoop.common.model.api.entities.Edge;
import org.gradoop.common.model.api.entities.GraphHead;
import org.gradoop.common.model.api.entities.Vertex;
import org.gradoop.common.model.impl.id.GradoopId;
import org.gradoop.common.model.impl.id.GradoopIdSet;
import org.gradoop.common.model.impl.properties.PropertyValue;
import org.gradoop.flink.model.api.epgm.BaseGraph;
import org.gradoop.flink.model.api.epgm.BaseGraphCollection;
import org.gradoop.flink.model.api.functions.Function;
import org.gradoop.flink.model.api.operators.UnaryBaseGraphToBaseGraphCollectionOperator;
import org.gradoop.flink.model.impl.functions.epgm.Id;
import org.gradoop.flink.model.impl.functions.epgm.InitGraphHead;
import org.gradoop.flink.model.impl.functions.epgm.PairTupleWithNewId;
import org.gradoop.flink.model.impl.functions.epgm.SourceId;
import org.gradoop.flink.model.impl.functions.tuple.Project2To1;
import org.gradoop.flink.model.impl.operators.split.functions.AddNewGraphsToEdge;
import org.gradoop.flink.model.impl.operators.split.functions.AddNewGraphsToVertex;
import org.gradoop.flink.model.impl.operators.split.functions.JoinEdgeTupleWithSourceGraphs;
import org.gradoop.flink.model.impl.operators.split.functions.JoinEdgeTupleWithTargetGraphs;
import org.gradoop.flink.model.impl.operators.split.functions.JoinVertexIdWithGraphIds;
import org.gradoop.flink.model.impl.operators.split.functions.MultipleGraphIdsGroupReducer;
import org.gradoop.flink.model.impl.operators.split.functions.SplitValues;

import java.io.Serializable;
import java.util.List;

/**
 * Operator that breaks a single graph up into a graph collection. A user-defined function yields a list of
 * property values for every vertex; each distinct value is mapped to one new graph id, and a vertex is
 * assigned the graph ids of all values produced for it, so the resulting graphs may overlap. Edges whose
 * source and target vertex have no graph in common are not part of the resulting collection.
 *
 * @param <G>  The graph head type.
 * @param <V>  The vertex type.
 * @param <E>  The edge type.
 * @param <LG> The type of the graph.
 * @param <GC> The type of the graph collection.
 */
public class Split<
  G extends GraphHead,
  V extends Vertex,
  E extends Edge,
  LG extends BaseGraph<G, V, E, LG, GC>,
  GC extends BaseGraphCollection<G, V, E, LG, GC>>
  implements UnaryBaseGraphToBaseGraphCollectionOperator<LG, GC>, Serializable {

  /**
   * User-defined function that extracts the split values of a vertex
   */
  private final Function<V, List<PropertyValue>> function;

  /**
   * Creates a new split operator.
   *
   * @param function user-defined function returning the split values of a vertex
   */
  public Split(Function<V, List<PropertyValue>> function) {
    this.function = function;
  }

  /**
   * Builds the dataflow that splits the given graph and wraps the resulting graph heads, vertices and
   * edges into a graph collection.
   *
   * @param graph the graph to split
   * @return collection created from the new graph heads and the updated vertices and edges
   */
  @Override
  public GC execute(LG graph) {
    // (vertex id, split value) pairs; the split value decides which new graph a vertex belongs to
    DataSet<Tuple2<GradoopId, PropertyValue>> splitValuePerVertex =
      graph.getVertices().flatMap(new SplitValues<>(function));

    // (split value, new graph id) pairs, one per distinct split value
    DataSet<Tuple2<PropertyValue, GradoopId>> graphIdPerSplitValue =
      createGraphIds(splitValuePerVertex);

    // (vertex id, set of new graph ids) pairs
    DataSet<Tuple2<GradoopId, GradoopIdSet>> graphIdsPerVertex =
      collectGraphIdsPerVertex(splitValuePerVertex, graphIdPerSplitValue);

    DataSet<V> splitVertices = assignVertices(graph, graphIdsPerVertex);
    DataSet<G> graphHeads = createGraphHeads(graph, graphIdPerSplitValue);
    DataSet<E> splitEdges = assignEdges(graph, graphIdsPerVertex);

    return graph.getCollectionFactory().fromDataSets(graphHeads, splitVertices, splitEdges);
  }

  /**
   * Reduces the split values to their distinct set and pairs every distinct value with a new id.
   *
   * @param splitValuePerVertex tuples of vertex id and split value
   * @return tuples of split value and the graph id generated for it
   */
  private static DataSet<Tuple2<PropertyValue, GradoopId>> createGraphIds(
    DataSet<Tuple2<GradoopId, PropertyValue>> splitValuePerVertex) {

    DataSet<Tuple1<PropertyValue>> uniqueSplitValues = splitValuePerVertex
      .map(new Project2To1<>())
      .distinct();

    return uniqueSplitValues.map(new PairTupleWithNewId<>());
  }

  /**
   * Joins the vertex split values with the graph ids of these values and groups the result by vertex id.
   *
   * @param splitValuePerVertex  tuples of vertex id and split value
   * @param graphIdPerSplitValue tuples of split value and graph id
   * @return tuples of vertex id and the set of graph ids associated with that vertex
   */
  private static DataSet<Tuple2<GradoopId, GradoopIdSet>> collectGraphIdsPerVertex(
    DataSet<Tuple2<GradoopId, PropertyValue>> splitValuePerVertex,
    DataSet<Tuple2<PropertyValue, GradoopId>> graphIdPerSplitValue) {

    return splitValuePerVertex
      .join(graphIdPerSplitValue)
      .where(1).equalTo(0)
      .with(new JoinVertexIdWithGraphIds())
      .groupBy(0)
      .reduceGroup(new MultipleGraphIdsGroupReducer());
  }

  /**
   * Joins the vertices of the input graph with their new graph ids and adds these ids to the vertices.
   *
   * @param graph             the graph to split
   * @param graphIdsPerVertex tuples of vertex id and graph id set
   * @return the vertices of the resulting collection
   */
  private DataSet<V> assignVertices(LG graph,
    DataSet<Tuple2<GradoopId, GradoopIdSet>> graphIdsPerVertex) {

    return graph.getVertices()
      .join(graphIdsPerVertex)
      .where(new Id<>()).equalTo(0)
      .with(new AddNewGraphsToVertex<>());
  }

  /**
   * Initializes one graph head for every newly generated graph id.
   *
   * @param graph                the graph to split, provides the graph head factory
   * @param graphIdPerSplitValue tuples of split value and graph id
   * @return the graph heads of the resulting collection
   */
  private DataSet<G> createGraphHeads(LG graph,
    DataSet<Tuple2<PropertyValue, GradoopId>> graphIdPerSplitValue) {

    DataSet<Tuple1<GradoopId>> generatedGraphIds = graphIdPerSplitValue
      .map(new Project2To1<>());

    return generatedGraphIds
      .map(new InitGraphHead<>(graph.getFactory().getGraphHeadFactory()));
  }

  /**
   * Looks up the graph ids of the source and of the target vertex of every edge and derives the edges of
   * the resulting collection from them.
   *
   * @param graph             the graph to split
   * @param graphIdsPerVertex tuples of vertex id and graph id set
   * @return the edges of the resulting collection
   */
  private DataSet<E> assignEdges(LG graph,
    DataSet<Tuple2<GradoopId, GradoopIdSet>> graphIdsPerVertex) {

    // first join resolves the source vertex, second join the target vertex of the edge
    DataSet<Tuple3<E, GradoopIdSet, GradoopIdSet>> edgeWithEndpointGraphIds =
      graph.getEdges()
        .join(graphIdsPerVertex)
        .where(new SourceId<>()).equalTo(0)
        .with(new JoinEdgeTupleWithSourceGraphs<>())
        .join(graphIdsPerVertex)
        .where("f0.targetId").equalTo(0)
        .with(new JoinEdgeTupleWithTargetGraphs<>());

    // add new graph ids to the edges iff source and target are contained in the same graph
    return edgeWithEndpointGraphIds.flatMap(new AddNewGraphsToEdge<>());
  }
}