Apache ignite statefulsets pods abruptly restarts

classic Classic list List threaded Threaded
4 messages Options
Sanjaya Sanjaya
Reply | Threaded
Open this post in threaded view
|

Apache ignite statefulsets pods abruptly restarts

Hi All,

In our production environment, Ignite v2.8.1 is installed as a Kubernetes
StatefulSet inside an Azure Kubernetes cluster. There are 2 pods
running.

Ignite is persistence enabled, with on heap cache only.

The pod is running with below guaranteed resources
Memory : 11 GB
CPU    : 3 core

Ignite is given heap as : 10.25 GB  
The total data region size is as : 8GB


We are getting the error below when 2 caches are joined with each other
without any indexing: the JVM of one of the pods simply restarts, and we are
not sure what's going on. The use case is that the Ignite cache grid holds all
master data, which is loaded from Postgres, and it is planned to be called
from 30+ different pods for the same kind of queries.

We are completely stuck on this use case, and wondering whether Ignite is the
right fit for it.

 
The stack trace is below
=============================================================
  AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1]
[09:43:10,370][WARNING][jvm-pause-detector-worker][IgniteKernal] Possible
too long JVM pause: 872 milliseconds.
[09:43:10,630][WARNING][client-connector-#52][IgniteH2Indexing] Long running
query is finished [time=4316ms, type=MAP, distributedJoin=false,
enforceJoinOrder=true, lazy=false, schema=CRTX, node=TcpDiscoveryNode
[id=4093191a-f958-4b4b-bf55-ae774d450fa2,
consistentId=4ed84cd6-d24c-4b2e-b61b-e747b0a6e6ba, addrs=ArrayList
[10.188.0.108, 127.0.0.1], sockAddrs=HashSet
[ignite-0.ignite.ignite.svc.cluster.local/10.188.0.108:47500,
/127.0.0.1:47500], discPort=47500, order=2, intOrder=2,
lastExchangeTime=1600681390383, loc=true, ver=2.8.1#20200521-sha1:86422096,
isClient=false], reqId=145, segment=0, sql='SELECT
A__Z0.ASSET_UID __C0_0,
A__Z0.ATTRIBUTE_CODE __C0_1,
B__Z1.TYPE __C0_2,
A__Z0.NUMVALUE __C0_3,
A__Z0.UNIT_SYMBOL __C0_4,
A__Z0.ALNVALUE __C0_5,
A__Z0.CHANGEDATE __C0_6,
B__Z1.CHANGEDATE __C0_7,
A__Z0.ORG_ID __C0_8
FROM CRTX.ASSET B__Z1
 INNER JOIN CRTX.ASSETSPEC A__Z0
 ON TRUE
WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843') AND ((A__Z0.ORG_ID = ?4) AND
(((A__Z0.CHANGEDATE > ?2) OR (B__Z1.CHANGEDATE > ?3)) AND ((B__Z1.TYPE = ?1)
AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1', plan=SELECT
    A__Z0.ASSET_UID AS __C0_0,
    A__Z0.ATTRIBUTE_CODE AS __C0_1,
    B__Z1.TYPE AS __C0_2,
    A__Z0.NUMVALUE AS __C0_3,
    A__Z0.UNIT_SYMBOL AS __C0_4,
    A__Z0.ALNVALUE AS __C0_5,
    A__Z0.CHANGEDATE AS __C0_6,
    B__Z1.CHANGEDATE AS __C0_7,
    A__Z0.ORG_ID AS __C0_8
FROM CRTX.ASSET B__Z1
    /* CRTX.ASSET.__SCAN_ */
    /* WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843')
        AND (B__Z1.TYPE = ?1)
    */
    /* scanCount: 377126 */
INNER JOIN CRTX.ASSETSPEC A__Z0
    /* CRTX."_key_PK": ASSET_UID = B__Z1.ASSET_UID */
    ON 1=1
WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843')
    AND ((A__Z0.ORG_ID = ?4)
    AND (((A__Z0.CHANGEDATE > ?2)
    OR (B__Z1.CHANGEDATE > ?3))
    AND ((B__Z1.TYPE = ?1)
    AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1]
/opt/ignite/apache-ignite/bin/ignite.sh: line 207:    74 Killed                
"$JAVA" ${JVM_OPTS} ${QUIET} "${RESTART_SUCCESS_OPT}" ${JMX_MON:-}
-DIGNITE_HOME="${IGNITE_HOME}" -DIGNITE_PROG_NAME="$0" ${JVM_XOPTS} -cp
"${CP}" ${MAIN_CLASS} "${CONFIG}"




--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/
ezhuravlev ezhuravlev
Reply | Threaded
Open this post in threaded view
|

Re: Apache ignite statefulsets pods abruptly restarts

There is no such thing as "on heap cache only". It's possible to enable an additional cache level on the heap, but all data will still be stored off-heap. So, right now you need at least 10.25 GB (heap) + 8 GB (data region) + the checkpoint buffer size for your Ignite node.

Evgenii

пн, 21 сент. 2020 г. в 09:29, Sanjaya <[hidden email]>:
Hi All,

In out production environment, ignite v2.8.1 is install as a kubernetes
stateful sets pods inside Azure Kubernetes cluster. There are 2 pods
running.

Ignite is persistence enabled, with on heap cache only.

The pod is running with below guaranteed resources
Memory : 11 GB
CPU    : 3 core

Ignite is given heap as : 10.25 GB 
The total data region size is as : 8GB


We are getting below error when 2 caches joins each other without any
indexing, one of PODS jvm simply restarts, we are not sure whats going on.
The usecase is that ignite cache grid hold all master data and gets loads
from postgres, and plannned to being called from 30+ differen pods for same
kind of queries.

We are completely stuck in this usecase, and thinking if ignite is right for
this usecase.


The stack trace as is below
=============================================================
  AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1]
[09:43:10,370][WARNING][jvm-pause-detector-worker][IgniteKernal] Possible
too long JVM pause: 872 milliseconds.
[09:43:10,630][WARNING][client-connector-#52][IgniteH2Indexing] Long running
query is finished [time=4316ms, type=MAP, distributedJoin=false,
enforceJoinOrder=true, lazy=false, schema=CRTX, node=TcpDiscoveryNode
[id=4093191a-f958-4b4b-bf55-ae774d450fa2,
consistentId=4ed84cd6-d24c-4b2e-b61b-e747b0a6e6ba, addrs=ArrayList
[10.188.0.108, 127.0.0.1], sockAddrs=HashSet
[ignite-0.ignite.ignite.svc.cluster.local/10.188.0.108:47500,
/127.0.0.1:47500], discPort=47500, order=2, intOrder=2,
lastExchangeTime=1600681390383, loc=true, ver=2.8.1#20200521-sha1:86422096,
isClient=false], reqId=145, segment=0, sql='SELECT
A__Z0.ASSET_UID __C0_0,
A__Z0.ATTRIBUTE_CODE __C0_1,
B__Z1.TYPE __C0_2,
A__Z0.NUMVALUE __C0_3,
A__Z0.UNIT_SYMBOL __C0_4,
A__Z0.ALNVALUE __C0_5,
A__Z0.CHANGEDATE __C0_6,
B__Z1.CHANGEDATE __C0_7,
A__Z0.ORG_ID __C0_8
FROM CRTX.ASSET B__Z1
 INNER JOIN CRTX.ASSETSPEC A__Z0
 ON TRUE
WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843') AND ((A__Z0.ORG_ID = ?4) AND
(((A__Z0.CHANGEDATE > ?2) OR (B__Z1.CHANGEDATE > ?3)) AND ((B__Z1.TYPE = ?1)
AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1', plan=SELECT
    A__Z0.ASSET_UID AS __C0_0,
    A__Z0.ATTRIBUTE_CODE AS __C0_1,
    B__Z1.TYPE AS __C0_2,
    A__Z0.NUMVALUE AS __C0_3,
    A__Z0.UNIT_SYMBOL AS __C0_4,
    A__Z0.ALNVALUE AS __C0_5,
    A__Z0.CHANGEDATE AS __C0_6,
    B__Z1.CHANGEDATE AS __C0_7,
    A__Z0.ORG_ID AS __C0_8
FROM CRTX.ASSET B__Z1
    /* CRTX.ASSET.__SCAN_ */
    /* WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843')
        AND (B__Z1.TYPE = ?1)
    */
    /* scanCount: 377126 */
INNER JOIN CRTX.ASSETSPEC A__Z0
    /* CRTX."_key_PK": ASSET_UID = B__Z1.ASSET_UID */
    ON 1=1
WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843')
    AND ((A__Z0.ORG_ID = ?4)
    AND (((A__Z0.CHANGEDATE > ?2)
    OR (B__Z1.CHANGEDATE > ?3))
    AND ((B__Z1.TYPE = ?1)
    AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1]
/opt/ignite/apache-ignite/bin/ignite.sh: line 207:    74 Killed                 
"$JAVA" ${JVM_OPTS} ${QUIET} "${RESTART_SUCCESS_OPT}" ${JMX_MON:-}
-DIGNITE_HOME="${IGNITE_HOME}" -DIGNITE_PROG_NAME="$0" ${JVM_XOPTS} -cp
"${CP}" ${MAIN_CLASS} "${CONFIG}"




--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/
Sanjaya Sanjaya
Reply | Threaded
Open this post in threaded view
|

Re: Apache ignite statefulsets pods abruptly restarts

Hi, 

Thanks for your reply. We have enabled on-heap caching in the Ignite configuration XML, as described here: https://apacheignite.readme.io/docs/memory-configuration#section-on-heap-caching

Basically, we have set it to true in the XML configuration: <property name="onheapCacheEnabled" value="true"/>.

The idea of using on-heap caching with persistence enabled is to minimize the latency of the query use case.

Am I doing anything wrong here ?


Thanks,
Sanjaya 

On Mon, Sep 21, 2020 at 10:11 PM Evgenii Zhuravlev <[hidden email]> wrote:
There is no such thing as "on heap cache only.". It's possible to enable an additional cache level in heap, but it still will be storing all data in the off heap. So, right now you need at least 10.25+8gb+ Checkpoint buffer size for your Ignite node.

Evgenii

пн, 21 сент. 2020 г. в 09:29, Sanjaya <[hidden email]>:
Hi All,

In out production environment, ignite v2.8.1 is install as a kubernetes
stateful sets pods inside Azure Kubernetes cluster. There are 2 pods
running.

Ignite is persistence enabled, with on heap cache only.

The pod is running with below guaranteed resources
Memory : 11 GB
CPU    : 3 core

Ignite is given heap as : 10.25 GB 
The total data region size is as : 8GB


We are getting below error when 2 caches joins each other without any
indexing, one of PODS jvm simply restarts, we are not sure whats going on.
The usecase is that ignite cache grid hold all master data and gets loads
from postgres, and plannned to being called from 30+ differen pods for same
kind of queries.

We are completely stuck in this usecase, and thinking if ignite is right for
this usecase.


The stack trace as is below
=============================================================
  AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1]
[09:43:10,370][WARNING][jvm-pause-detector-worker][IgniteKernal] Possible
too long JVM pause: 872 milliseconds.
[09:43:10,630][WARNING][client-connector-#52][IgniteH2Indexing] Long running
query is finished [time=4316ms, type=MAP, distributedJoin=false,
enforceJoinOrder=true, lazy=false, schema=CRTX, node=TcpDiscoveryNode
[id=4093191a-f958-4b4b-bf55-ae774d450fa2,
consistentId=4ed84cd6-d24c-4b2e-b61b-e747b0a6e6ba, addrs=ArrayList
[10.188.0.108, 127.0.0.1], sockAddrs=HashSet
[ignite-0.ignite.ignite.svc.cluster.local/10.188.0.108:47500,
/127.0.0.1:47500], discPort=47500, order=2, intOrder=2,
lastExchangeTime=1600681390383, loc=true, ver=2.8.1#20200521-sha1:86422096,
isClient=false], reqId=145, segment=0, sql='SELECT
A__Z0.ASSET_UID __C0_0,
A__Z0.ATTRIBUTE_CODE __C0_1,
B__Z1.TYPE __C0_2,
A__Z0.NUMVALUE __C0_3,
A__Z0.UNIT_SYMBOL __C0_4,
A__Z0.ALNVALUE __C0_5,
A__Z0.CHANGEDATE __C0_6,
B__Z1.CHANGEDATE __C0_7,
A__Z0.ORG_ID __C0_8
FROM CRTX.ASSET B__Z1
 INNER JOIN CRTX.ASSETSPEC A__Z0
 ON TRUE
WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843') AND ((A__Z0.ORG_ID = ?4) AND
(((A__Z0.CHANGEDATE > ?2) OR (B__Z1.CHANGEDATE > ?3)) AND ((B__Z1.TYPE = ?1)
AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1', plan=SELECT
    A__Z0.ASSET_UID AS __C0_0,
    A__Z0.ATTRIBUTE_CODE AS __C0_1,
    B__Z1.TYPE AS __C0_2,
    A__Z0.NUMVALUE AS __C0_3,
    A__Z0.UNIT_SYMBOL AS __C0_4,
    A__Z0.ALNVALUE AS __C0_5,
    A__Z0.CHANGEDATE AS __C0_6,
    B__Z1.CHANGEDATE AS __C0_7,
    A__Z0.ORG_ID AS __C0_8
FROM CRTX.ASSET B__Z1
    /* CRTX.ASSET.__SCAN_ */
    /* WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843')
        AND (B__Z1.TYPE = ?1)
    */
    /* scanCount: 377126 */
INNER JOIN CRTX.ASSETSPEC A__Z0
    /* CRTX."_key_PK": ASSET_UID = B__Z1.ASSET_UID */
    ON 1=1
WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843')
    AND ((A__Z0.ORG_ID = ?4)
    AND (((A__Z0.CHANGEDATE > ?2)
    OR (B__Z1.CHANGEDATE > ?3))
    AND ((B__Z1.TYPE = ?1)
    AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1]
/opt/ignite/apache-ignite/bin/ignite.sh: line 207:    74 Killed                 
"$JAVA" ${JVM_OPTS} ${QUIET} "${RESTART_SUCCESS_OPT}" ${JMX_MON:-}
-DIGNITE_HOME="${IGNITE_HOME}" -DIGNITE_PROG_NAME="$0" ${JVM_XOPTS} -cp
"${CP}" ${MAIN_CLASS} "${CONFIG}"




--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/
stephendarlington stephendarlington
Reply | Threaded
Open this post in threaded view
|

Re: Apache ignite statefulsets pods abruptly restarts

The on-heap cache is in addition to the off-heap cache.

8Gb (off-heap)
+ 8Gb (on-heap cache)
+ ~4Gb (heap space for running Ignite)
= ~ 20Gb

Way more than the 11Gb you have allocated.

Three steps:
  • Turn off the on-heap cache
  • Decrease the data region to maybe 6Gb (in Ignite configuration)
  • Allocate around 4Gb to your heap (by setting JVM_OPTS)

On 22 Sep 2020, at 06:10, Sanjaya Kumar Sahoo <[hidden email]> wrote:

Hi, 

Thanks for your reply, we have set on heap as below in ignite configuration xml,  https://apacheignite.readme.io/docs/memory-configuration#section-on-heap-caching

Basically we have set as true in  xml configuration as  <property name="onheapCacheEnabled" value="true">.

The idea to move on heap with persistence enable is to maximize the latency of query use case

Am I doing anything wrong here ?


Thanks,
Sanjaya 

On Mon, Sep 21, 2020 at 10:11 PM Evgenii Zhuravlev <[hidden email]> wrote:
There is no such thing as "on heap cache only.". It's possible to enable an additional cache level in heap, but it still will be storing all data in the off heap. So, right now you need at least 10.25+8gb+ Checkpoint buffer size for your Ignite node.

Evgenii

пн, 21 сент. 2020 г. в 09:29, Sanjaya <[hidden email]>:
Hi All,

In out production environment, ignite v2.8.1 is install as a kubernetes
stateful sets pods inside Azure Kubernetes cluster. There are 2 pods
running.

Ignite is persistence enabled, with on heap cache only.

The pod is running with below guaranteed resources
Memory : 11 GB
CPU    : 3 core

Ignite is given heap as : 10.25 GB 
The total data region size is as : 8GB


We are getting below error when 2 caches joins each other without any
indexing, one of PODS jvm simply restarts, we are not sure whats going on.
The usecase is that ignite cache grid hold all master data and gets loads
from postgres, and plannned to being called from 30+ differen pods for same
kind of queries.

We are completely stuck in this usecase, and thinking if ignite is right for
this usecase.


The stack trace as is below
=============================================================
  AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1]
[09:43:10,370][WARNING][jvm-pause-detector-worker][IgniteKernal] Possible
too long JVM pause: 872 milliseconds.
[09:43:10,630][WARNING][client-connector-#52][IgniteH2Indexing] Long running
query is finished [time=4316ms, type=MAP, distributedJoin=false,
enforceJoinOrder=true, lazy=false, schema=CRTX, node=TcpDiscoveryNode
[id=4093191a-f958-4b4b-bf55-ae774d450fa2,
consistentId=4ed84cd6-d24c-4b2e-b61b-e747b0a6e6ba, addrs=ArrayList
[10.188.0.108, 127.0.0.1], sockAddrs=HashSet
[ignite-0.ignite.ignite.svc.cluster.local/10.188.0.108:47500,
/127.0.0.1:47500], discPort=47500, order=2, intOrder=2,
lastExchangeTime=1600681390383, loc=true, ver=2.8.1#20200521-sha1:86422096,
isClient=false], reqId=145, segment=0, sql='SELECT
A__Z0.ASSET_UID __C0_0,
A__Z0.ATTRIBUTE_CODE __C0_1,
B__Z1.TYPE __C0_2,
A__Z0.NUMVALUE __C0_3,
A__Z0.UNIT_SYMBOL __C0_4,
A__Z0.ALNVALUE __C0_5,
A__Z0.CHANGEDATE __C0_6,
B__Z1.CHANGEDATE __C0_7,
A__Z0.ORG_ID __C0_8
FROM CRTX.ASSET B__Z1
 INNER JOIN CRTX.ASSETSPEC A__Z0
 ON TRUE
WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843') AND ((A__Z0.ORG_ID = ?4) AND
(((A__Z0.CHANGEDATE > ?2) OR (B__Z1.CHANGEDATE > ?3)) AND ((B__Z1.TYPE = ?1)
AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1', plan=SELECT
    A__Z0.ASSET_UID AS __C0_0,
    A__Z0.ATTRIBUTE_CODE AS __C0_1,
    B__Z1.TYPE AS __C0_2,
    A__Z0.NUMVALUE AS __C0_3,
    A__Z0.UNIT_SYMBOL AS __C0_4,
    A__Z0.ALNVALUE AS __C0_5,
    A__Z0.CHANGEDATE AS __C0_6,
    B__Z1.CHANGEDATE AS __C0_7,
    A__Z0.ORG_ID AS __C0_8
FROM CRTX.ASSET B__Z1
    /* CRTX.ASSET.__SCAN_ */
    /* WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843')
        AND (B__Z1.TYPE = ?1)
    */
    /* scanCount: 377126 */
INNER JOIN CRTX.ASSETSPEC A__Z0
    /* CRTX."_key_PK": ASSET_UID = B__Z1.ASSET_UID */
    ON 1=1
WHERE (B__Z1.LOCATION_UID = 'R02ERUS010843')
    AND ((A__Z0.ORG_ID = ?4)
    AND (((A__Z0.CHANGEDATE > ?2)
    OR (B__Z1.CHANGEDATE > ?3))
    AND ((B__Z1.TYPE = ?1)
    AND (A__Z0.ASSET_UID = B__Z1.ASSET_UID))))
ORDER BY 9, 1]
/opt/ignite/apache-ignite/bin/ignite.sh: line 207:    74 Killed                 
"$JAVA" ${JVM_OPTS} ${QUIET} "${RESTART_SUCCESS_OPT}" ${JMX_MON:-}
-DIGNITE_HOME="${IGNITE_HOME}" -DIGNITE_PROG_NAME="$0" ${JVM_XOPTS} -cp
"${CP}" ${MAIN_CLASS} "${CONFIG}"




--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/